In [1]:
import logging
import os
import asyncio

from ogmyrag.report_retrieval.report_retrieval import ReportRetrievalManager
from ogmyrag.report_retrieval.retrieval_embedder import RetrievalEmbedder
from ogmyrag.report_retrieval.retrieval_extractor import RetrievalExtractor
from ogmyrag.report_retrieval.retrieval_storage import RetrievalAsyncStorageManager
from ogmyrag.report_scraper.models import ReportType
from ogmyrag.my_logging import configure_logger

from dotenv import load_dotenv

# --- Logging -----------------------------------------------------------------
# Dedicated "retrieval" logger at INFO level, configured with logs/retrieval.log
# as its log file.
retrieval_logger = configure_logger(
    name='retrieval',
    log_level=logging.INFO,
    log_file='logs/retrieval.log',
)
# Separator line so each notebook session is easy to spot in the log.
retrieval_logger.info("\n" + "=" * 80)

# --- Credentials -------------------------------------------------------------
# override=True: values from the .env file replace variables that are already
# set in the process environment.
load_dotenv(override=True)

mongo_db_uri = os.getenv("MONGO_DB_URI_JJ", "")
pinecone_api_key = os.getenv("PINECONE_API_KEY_JJ", "")
genai_api_key = os.getenv("GENAI_API_KEY_JJ", "")
openai_api_key = os.getenv("OPENAI_API_KEY", "")

# The "" fallbacks above keep the cell runnable, but they also hide a missing
# or misspelled variable until a client call fails much later. Report the gap
# here, once, without changing the values handed to the clients.
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("MONGO_DB_URI_JJ", mongo_db_uri),
        ("PINECONE_API_KEY_JJ", pinecone_api_key),
        ("GENAI_API_KEY_JJ", genai_api_key),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if not env_value
]
if _missing_env_vars:
    retrieval_logger.warning(
        "Missing or empty environment variables: %s",
        ", ".join(_missing_env_vars),
    )

# --- Tunable constants -------------------------------------------------------
INDEX_NAME = "markdown-summary-index"   # vector index shared by embedder and extractor
EMBED_MODEL = "text-embedding-3-small"
DIMENSION = 1536                        # must match the output size of EMBED_MODEL
GENAI_MODEL = "gemini-2.5-pro"
#GENAI_MODEL = "gemini-2.5-flash"       # alternative model; uncomment to switch
OPENAI_MODEL = "gpt-4.1-nano-2025-04-14"  # chat model passed to answer_query below

# --- Component wiring --------------------------------------------------------
db_name = "FYP"
storage = RetrievalAsyncStorageManager(mongo_uri=mongo_db_uri, db_name=db_name)

# The embedder and extractor are given the same index name and embedding model.
embedder = RetrievalEmbedder(
    openai_api_key=openai_api_key,
    pinecone_api_key=pinecone_api_key,
    embed_model=EMBED_MODEL,
    index_name=INDEX_NAME,
    dimension=DIMENSION,
)

extractor = RetrievalExtractor(
    openai_api_key=openai_api_key,
    pinecone_api_key=pinecone_api_key,
    embed_model=EMBED_MODEL,
    index_name=INDEX_NAME,
)

manager = ReportRetrievalManager(
    storage=storage,
    embedder=embedder,
    extractor=extractor,
    genai_model=GENAI_MODEL,
    genai_api_key=genai_api_key,
    openai_api_key=openai_api_key,
    #dry_run = False   # NOTE(review): left at the manager's default; confirm what that default is
)

2025-08-21 13:35:37,517 - retrieval - INFO - 
2025-08-21 13:35:37,526 - retrieval - INFO - Connected to MongoDB database: FYP


## Process Financial Reports (PDF)

In [2]:
await manager.parse_report(
    company = "EDELTEQ HOLDINGS BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-08-21 13:35:40,058 - retrieval - INFO - Already processed and up to date.
2025-08-21 13:35:40,059 - retrieval - INFO - Skipping processing, using existing content.
2025-08-21 13:35:40,060 - retrieval - INFO - Extracting all the processed content.
2025-08-21 13:35:40,192 - retrieval - INFO - Combining all the processed content.
2025-08-21 13:35:40,194 - retrieval - INFO - Processed content ready.


ValueError: too many values to unpack (expected 2)

In [2]:
await manager.parse_report(
    company = "VETECE_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    forced_process = True
)

2025-08-21 12:09:04,515 - retrieval - INFO - Fresh processing of all docs.
2025-08-21 12:09:04,516 - retrieval - INFO -      Uploading 01._VETECE_Holdings_Berhad_-_Prospectus_(Part_1).pdf ...
2025-08-21 12:09:04,749 - retrieval - INFO -      Uploading 02._VETECE_Holdings_Berhad_-_Prospectus_(Part_2).pdf ...
2025-08-21 12:09:04,892 - retrieval - INFO -      Uploading 03._VETECE_Holdings_Berhad_-_Detailed_Procedures_for_Application.pdf ...
2025-08-21 12:09:12,672 - retrieval - INFO - Uploaded 3 PDFs
2025-08-21 12:09:12,674 - retrieval - INFO - Fresh processing mode, extracting definitions and TOC.
2025-08-21 12:09:12,695 - retrieval - INFO - Definitions already exist, skipping extraction.
2025-08-21 12:09:12,716 - retrieval - INFO - Table of Contents already exists, skipping extraction.
2025-08-21 12:09:12,740 - retrieval - INFO - Sections to extract: ['1. CORPORATE DIRECTORY', '2. APPROVALS AND CONDITIONS', '3. PROSPECTUS SUMMARY', '4. DETAILS OF OUR IPO', '5. INFORMATION ON OUR PROMOTE

In [None]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    forced_process = True
)

In [5]:
await manager.parse_report(
    company = "AUTOCOUNT_DOTCOM_BERHAD",
    report_type = ReportType.IPO
)

2025-08-16 16:29:23,601 - retrieval - INFO - Already processed and up to date.
2025-08-16 16:29:23,602 - retrieval - INFO - Skipping processing, using existing content.
2025-08-16 16:29:23,719 - retrieval - INFO - Dry run enabled, skipping chunk upsert.
2025-08-16 16:29:23,723 - retrieval - INFO - Saved processed report to ./processed_report/AUTOCOUNT_DOTCOM_BERHAD/AUTOCOUNT_DOTCOM_BERHAD_IPO.md


## Simple User Query

In [4]:
# Ask one question about FARM_FRESH_BERHAD and display the generated answer.
# Called without await; the recorded output is the answer string itself.
# top_k presumably caps the number of retrieved chunks (the recorded run
# retrieved five) - TODO confirm.
# Assumes this company's report is already indexed; no visible cell parses it.
manager.answer_query(
    company="FARM_FRESH_BERHAD",
    query="What is the mission and vision?",
    top_k=5,
    chat_model=OPENAI_MODEL,
)

2025-08-03 11:40:56,916 - retrieval - INFO - Embedding query: 'What is the mission and vision?'
2025-08-03 11:40:58,284 - retrieval - INFO - Retrieved chunks: ['**Vision & mission statement:**', '*   **Our Promise (Vision):** "TO BE A SUSTAINABLE AND HONEST FOOD COMPANY THAT HAS AN INGRAINED CULTURE OF PLACING THE WELL-BEING OF CONSUMERS FIRST, CULMINATING IN STRONG BRAND LOVE." (pg 12)', '*   **Our Action Plan (Mission):** Farm Fresh achieves its brand promise by:', "**Business strategy and outlook:** The Group's strategy is centered on three core pillars: Expansion, Innovation, and Reputation. (pg 62-63)", '## Leadership & Governance']
2025-08-03 11:40:59,134 - retrieval - INFO - Response generated: The vision of the company is to be a sustainable and honest food company that has an ingrained culture of placing the well-being of consumers first, culminating in strong brand love.

The mission focuses on achieving this promise by implementing specific actions, although the detailed act

"The vision of the company is to be a sustainable and honest food company that has an ingrained culture of placing the well-being of consumers first, culminating in strong brand love.\n\nThe mission focuses on achieving this promise by implementing specific actions, although the detailed action plan is outlined in the company's strategy document."