In [1]:
import logging
import os
import asyncio

from ogmyrag.report_retrieval.report_retrieval import ReportRetrievalManager
from ogmyrag.report_retrieval.retrieval_embedder import RetrievalEmbedder
from ogmyrag.report_retrieval.retrieval_extractor import RetrievalExtractor
from ogmyrag.report_retrieval.retrieval_storage import RetrievalAsyncStorageManager
from ogmyrag.report_scraper.models import ReportType
from ogmyrag.my_logging import configure_logger

from dotenv import load_dotenv

# --- Logging ------------------------------------------------------------------
# Dedicated "retrieval" logger (INFO level, log_file='logs/retrieval.log').
# The separator line makes the start of each notebook session easy to spot.
retrieval_logger = configure_logger(name='retrieval',log_level=logging.INFO, log_file='logs/retrieval.log')
retrieval_logger.info("\n" + "=" * 80)

# --- Credentials --------------------------------------------------------------
# override=True: values from the .env file replace variables that are already
# set in the process environment.
load_dotenv(override=True)

mongo_db_uri = os.getenv("MONGO_DB_URI_JJ", "")
pinecone_api_key = os.getenv("PINECONE_API_KEY_JJ", "")
genai_api_key = os.getenv("GENAI_API_KEY_JJ", "")
openai_api_key = os.getenv("OPENAI_API_KEY", "")

# Each lookup above falls back to "", so a missing variable would otherwise only
# show up later as an opaque connection/authentication error. Report it here,
# before any client is constructed. This only warns; it does not abort the cell.
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("MONGO_DB_URI_JJ", mongo_db_uri),
        ("PINECONE_API_KEY_JJ", pinecone_api_key),
        ("GENAI_API_KEY_JJ", genai_api_key),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if not env_value
]
if _missing_env_vars:
    retrieval_logger.warning(
        "Missing or empty environment variables: %s", ", ".join(_missing_env_vars)
    )

# --- Model / index configuration ----------------------------------------------
INDEX_NAME = "markdown-summary-index"
EMBED_MODEL = "text-embedding-3-small"
# NOTE(review): presumably must match the embedding model's output size
# (1536 is the default for text-embedding-3-small) — confirm against the index.
DIMENSION = 1536
# Alternative Gemini model that can be swapped in: "gemini-2.5-pro"
GENAI_MODEL = "gemini-2.5-flash"
OPENAI_MODEL = "gpt-4.1-nano-2025-04-14"

# --- Pipeline wiring ----------------------------------------------------------
db_name = "FYP"
storage = RetrievalAsyncStorageManager(mongo_uri=mongo_db_uri, db_name=db_name)

# Embedder and extractor are given the same Pinecone index and embedding model.
embedder = RetrievalEmbedder(
    openai_api_key=openai_api_key,
    pinecone_api_key=pinecone_api_key,
    embed_model=EMBED_MODEL,
    index_name=INDEX_NAME,
    dimension=DIMENSION,
)

extractor = RetrievalExtractor(
    openai_api_key=openai_api_key,
    pinecone_api_key=pinecone_api_key,
    embed_model=EMBED_MODEL,
    index_name=INDEX_NAME,
)

# Entry point used by every cell below.
manager = ReportRetrievalManager(
    storage=storage,
    embedder=embedder,
    extractor=extractor,
    genai_model=GENAI_MODEL,
    genai_api_key=genai_api_key,
    openai_api_key=openai_api_key,
    # dry_run is intentionally not passed here (previously toggled as `dry_run = False`).
)

2025-08-07 13:51:01,949 - retrieval - INFO - 
2025-08-07 13:51:01,958 - retrieval - INFO - Connected to MongoDB database: FYP


## Process Financial Reports (PDF)

In [2]:
await manager.parse_report(
    company = "VETECE_HOLDINGS_BERHAD",
    report_type = ReportType.IPO
)

2025-08-07 13:49:54,955 - retrieval - INFO - Already processed and up to date.
2025-08-07 13:49:54,956 - retrieval - INFO - Uploaded 0 PDFs
2025-08-07 13:49:54,986 - retrieval - INFO - Definitions already exist, skipping extraction.
2025-08-07 13:49:55,006 - retrieval - INFO - Table of Contents already exists, skipping extraction.
2025-08-07 13:49:55,030 - retrieval - INFO - Sections to extract: ['1. CORPORATE DIRECTORY', '2. APPROVALS AND CONDITIONS', '3. PROSPECTUS SUMMARY', '4. DETAILS OF OUR IPO', '5. INFORMATION ON OUR PROMOTERS, SUBSTANTIAL SHAREHOLDER, DIRECTORS AND KEY SENIOR MANAGEMENT', '6. INFORMATION ON OUR GROUP', '7. BUSINESS OVERVIEW', '8. INDEPENDENT MARKET RESEARCH REPORT', '9. RISK FACTORS', '10. RELATED PARTY TRANSACTIONS', '11. CONFLICT OF INTEREST', '12. FINANCIAL INFORMATION', '13. REPORTING ACCOUNTANTS’ REPORT ON THE PRO FORMA COMBINED STATEMENTS OF FINANCIAL POSITION', '14. ACCOUNTANTS’ REPORT', '15. ADDITIONAL INFORMATION', '16. SUMMARISED PROCEDURES FOR APPLIC

In [2]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.IPO
)

2025-08-07 13:51:16,882 - retrieval - INFO - Already processed and up to date.
2025-08-07 13:51:16,884 - retrieval - INFO - Uploaded 0 PDFs
2025-08-07 13:51:16,909 - retrieval - INFO - Definitions already exist, skipping extraction.
2025-08-07 13:51:16,932 - retrieval - INFO - Table of Contents already exists, skipping extraction.
2025-08-07 13:51:16,955 - retrieval - INFO - Sections to extract: ['1. CORPORATE DIRECTORY', '2. INFORMATION SUMMARY', '3. DETAILS OF OUR IPO', '4. RISK FACTORS', '5. INFORMATION OF OUR GROUP', '6. BUSINESS OVERVIEW', '7. EXECUTIVE SUMMARY OF THE INDEPENDENT MARKET RESEARCH REPORT', '8. INFORMATION ON OUR PROMOTERS, SUBSTANTIAL SHAREHOLDERS, DIRECTORS AND KEY MANAGEMENT', '9. APPROVALS AND CONDITIONS', '10. RELATED PARTY TRANSACTIONS AND CONFLICT OF INTEREST', '11. FINANCIAL INFORMATION', "12. ACCOUNTANTS' REPORT", "13. DIRECTORS' REPORT", '14. STATUTORY AND OTHER GENERAL INFORMATION', '15. PROCEDURE FOR APPLICATION AND ACCEPTANCE', '16. LIST OF ADAS']
2025-0

In [5]:
await manager.parse_report(
    company = "AUTOCOUNT_DOTCOM_BERHAD",
    report_type = ReportType.IPO
)

2025-08-07 09:43:02,166 - retrieval - INFO - Processed report found.
2025-08-07 09:43:02,296 - retrieval - INFO - Saved processed report to ./processed_report/AUTOCOUNT_DOTCOM_BERHAD/AUTOCOUNT_DOTCOM_BERHAD_IPO.md


## Simple User Query

In [4]:
# Ask one question about a single company; the call's return value is the
# cell's displayed result.
# NOTE(review): top_k presumably caps the number of retrieved chunks — confirm.
manager.answer_query(
    company="FARM_FRESH_BERHAD", query="What is the mission and vision?",
    top_k=5, chat_model=OPENAI_MODEL,
)

2025-08-03 11:40:56,916 - retrieval - INFO - Embedding query: 'What is the mission and vision?'
2025-08-03 11:40:58,284 - retrieval - INFO - Retrieved chunks: ['**Vision & mission statement:**', '*   **Our Promise (Vision):** "TO BE A SUSTAINABLE AND HONEST FOOD COMPANY THAT HAS AN INGRAINED CULTURE OF PLACING THE WELL-BEING OF CONSUMERS FIRST, CULMINATING IN STRONG BRAND LOVE." (pg 12)', '*   **Our Action Plan (Mission):** Farm Fresh achieves its brand promise by:', "**Business strategy and outlook:** The Group's strategy is centered on three core pillars: Expansion, Innovation, and Reputation. (pg 62-63)", '## Leadership & Governance']
2025-08-03 11:40:59,134 - retrieval - INFO - Response generated: The vision of the company is to be a sustainable and honest food company that has an ingrained culture of placing the well-being of consumers first, culminating in strong brand love.

The mission focuses on achieving this promise by implementing specific actions, although the detailed act

"The vision of the company is to be a sustainable and honest food company that has an ingrained culture of placing the well-being of consumers first, culminating in strong brand love.\n\nThe mission focuses on achieving this promise by implementing specific actions, although the detailed action plan is outlined in the company's strategy document."