In [1]:
import logging
import os
import asyncio

from ogmyrag.report_retrieval.report_retrieval import ReportRetrievalManager
from ogmyrag.report_retrieval.retrieval_storage import RetrievalAsyncStorageManager
from motor.motor_asyncio import AsyncIOMotorClient
from ogmyrag.base import MongoStorageConfig, PineconeStorageConfig
from ogmyrag.report_scraper.models import ReportType
from ogmyrag.my_logging import configure_logger
from ogmyrag.storage import PineconeStorage
from ogmyrag.report_retrieval.report_chunker import rag_answer_with_company_detection

from dotenv import load_dotenv

# Notebook-wide logger, configured at INFO level with logs/retrieval.log as its log file.
retrieval_logger = configure_logger(name='retrieval',log_level=logging.INFO, log_file='logs/retrieval.log')
# Separator line so each setup run is easy to spot in the log.
retrieval_logger.info("\n" + "=" * 80)

# override=True lets values from .env replace variables already set in the environment.
load_dotenv(override=True)

# Credentials are read from the environment only; an unset variable becomes "".
mongo_db_uri = os.getenv("MONGO_DB_URI_JJ","")
pinecone_api_key = os.getenv("PINECONE_API_KEY_JJ","")
genai_api_key = os.getenv("GENAI_API_KEY_JJ","")
openai_api_key = os.getenv("OPENAI_API_KEY", "")

# Report missing credentials right away instead of letting an empty string
# surface later as an obscure connection/authentication error. This is a
# warning only, so the rest of the cell still runs as before.
_missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("MONGO_DB_URI_JJ", mongo_db_uri),
        ("PINECONE_API_KEY_JJ", pinecone_api_key),
        ("GENAI_API_KEY_JJ", genai_api_key),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if not env_value
]
if _missing_env_vars:
    retrieval_logger.warning(
        "Missing or empty environment variables: %s", ", ".join(_missing_env_vars)
    )

# --- Vector index -----------------------------------------------------------
INDEX_NAME = "company-disclosures-index"
# NOTE(review): 1536 is presumably the output size of EMBED_MODEL; confirm if the model changes.
DIMENSION = 1536

# --- Models -----------------------------------------------------------------
EMBED_MODEL = "text-embedding-3-small"
OPENAI_MODEL = "gpt-5-nano"
# Alternatives previously tried for GENAI_MODEL:
#   "gemini-2.5-flash", "gemini-2.5-pro-preview-06-05"
GENAI_MODEL = "gemini-2.5-pro"

# --- MongoDB ----------------------------------------------------------------
db_name = "FYP"

# Storage settings: only the database name is configured here.
mongo_storage_config: MongoStorageConfig = {"database_name": db_name}

# Async Mongo client; server selection is capped at 5000 ms.
mongo_client = AsyncIOMotorClient(mongo_db_uri, serverSelectionTimeoutMS=5000)

# Storage manager used by the retrieval pipeline.
storage = RetrievalAsyncStorageManager(mongo_client, mongo_storage_config)


# Pinecone settings shared by index creation below and by the retrieval manager.
pinecone_config: PineconeStorageConfig = {
    "index_name": INDEX_NAME,
    "pinecone_api_key": pinecone_api_key,
    "pinecone_environment": "us-east-1",
    "pinecone_cloud": "aws",
    "pinecone_metric": "cosine",
    "pinecone_dimensions": DIMENSION,
    "openai_api_key": openai_api_key,
}

pine = PineconeStorage(pinecone_api_key=pinecone_api_key, openai_api_key=openai_api_key)

# Create the index if needed, using the same values stored in pinecone_config.
pine.create_index_if_not_exists(
    index_name=pinecone_config["index_name"],
    dimension=pinecone_config["pinecone_dimensions"],
    metric=pinecone_config["pinecone_metric"],
    cloud=pinecone_config["pinecone_cloud"],
    region=pinecone_config["pinecone_environment"],
)

# Retrieval manager used by every parse_report call in this notebook.
# The optional `dry_run` argument is left at its default (previously noted as `dry_run = False`).
manager = ReportRetrievalManager(
    storage=storage,
    pine=pine,
    pinecone_config=pinecone_config,
    genai_model=GENAI_MODEL,
    genai_api_key=genai_api_key,
    openai_api_key=openai_api_key,
)

2025-09-10 17:29:07,807 - retrieval - INFO - 
2025-09-10 17:29:07,821 - retrieval - INFO - Connected to MongoDB database: FYP


# Process Financial Reports (PDF)

In [None]:
# Company identifiers (underscore-separated), one per line, in two batches.
# Note: I-STONE_GROUP_BERHAD is the same as AIMFLEX BERHAD.
companies = """
    EDELTEQ_HOLDINGS_BERHAD
    AUTOCOUNT_DOTCOM_BERHAD
    ICT_ZONE_ASIA_BERHAD
    CABNET_HOLDINGS_BERHAD
    VETECE_HOLDINGS_BERHAD

    SFP_TECH_HOLDINGS_BERHAD
    TT_VISION_HOLDINGS_BERHAD
    RAMSSOL_GROUP_BERHAD
    I-STONE_GROUP_BERHAD
    ECA_INTEGRATED_SOLUTION_BERHAD
""".split()

# SFP TECH HOLDINGS BERHAD
- IPO
- ANNUAL 2024
- ANNUAL 2023
- ANNUAL 2022

In [9]:
await manager.parse_report(
    company = "SFP_TECH_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-08 22:48:14,335 - retrieval - INFO - Already processed and up to date.
2025-09-08 22:48:14,336 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 22:48:14,336 - retrieval - INFO - Extracting all the processed content.
2025-09-08 22:48:14,465 - retrieval - INFO - Combining all the processed content.
2025-09-08 22:48:14,466 - retrieval - INFO - Processed content ready.
2025-09-08 22:48:14,499 - retrieval - INFO - Saved processed report to ./processed_report/SFP_TECH_HOLDINGS_BERHAD/SFP_TECH_HOLDINGS_BERHAD_IPO.md


In [27]:
await manager.parse_report(
    company = "SFP_TECH_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-09 07:34:58,398 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:58,398 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:58,398 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:58,509 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:58,510 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:58,533 - retrieval - INFO - Saved processed report to ./processed_report/SFP_TECH_HOLDINGS_BERHAD/SFP_TECH_HOLDINGS_BERHAD_ANNUAL_2024.md


In [26]:
await manager.parse_report(
    company = "SFP_TECH_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-09 07:34:56,330 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:56,331 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:56,331 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:56,444 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:56,444 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:56,469 - retrieval - INFO - Saved processed report to ./processed_report/SFP_TECH_HOLDINGS_BERHAD/SFP_TECH_HOLDINGS_BERHAD_ANNUAL_2023.md


In [25]:
await manager.parse_report(
    company = "SFP_TECH_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2022,
    #forced_process = True
)

2025-09-09 07:34:54,686 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:54,686 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:54,687 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:54,801 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:54,801 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:54,827 - retrieval - INFO - Saved processed report to ./processed_report/SFP_TECH_HOLDINGS_BERHAD/SFP_TECH_HOLDINGS_BERHAD_ANNUAL_2022.md


# TT VISION HOLDINGS BERHAD
- IPO
- ANNUAL 2024
- ANNUAL 2023
- ANNUAL 2022

In [None]:
await manager.parse_report(
    company = "TT_VISION_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-08 22:48:12,462 - retrieval - INFO - Already processed and up to date.
2025-09-08 22:48:12,463 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 22:48:12,463 - retrieval - INFO - Extracting all the processed content.
2025-09-08 22:48:12,584 - retrieval - INFO - Combining all the processed content.
2025-09-08 22:48:12,584 - retrieval - INFO - Processed content ready.
2025-09-08 22:48:12,607 - retrieval - INFO - Saved processed report to ./processed_report/TT_VISION_HOLDINGS_BERHAD/TT_VISION_HOLDINGS_BERHAD_IPO.md


In [24]:
await manager.parse_report(
    company = "TT_VISION_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-09 07:34:51,918 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:51,918 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:51,919 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:52,033 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:52,034 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:52,058 - retrieval - INFO - Saved processed report to ./processed_report/TT_VISION_HOLDINGS_BERHAD/TT_VISION_HOLDINGS_BERHAD_ANNUAL_2024.md


In [23]:
await manager.parse_report(
    company = "TT_VISION_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-09 07:34:49,195 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:49,196 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:49,196 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:49,314 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:49,314 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:49,342 - retrieval - INFO - Saved processed report to ./processed_report/TT_VISION_HOLDINGS_BERHAD/TT_VISION_HOLDINGS_BERHAD_ANNUAL_2023.md


In [22]:
await manager.parse_report(
    company = "TT_VISION_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2022,
    #forced_process = True
)

2025-09-09 07:34:46,286 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:46,287 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:46,287 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:46,400 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:46,400 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:46,424 - retrieval - INFO - Saved processed report to ./processed_report/TT_VISION_HOLDINGS_BERHAD/TT_VISION_HOLDINGS_BERHAD_ANNUAL_2022.md


# RAMSSOL GROUP BERHAD
- IPO
- ANNUAL 2024
- ANNUAL 2023
- ANNUAL 2022
- ANNUAL 2021

In [7]:
await manager.parse_report(
    company = "RAMSSOL_GROUP_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-08 22:48:10,277 - retrieval - INFO - Already processed and up to date.
2025-09-08 22:48:10,277 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 22:48:10,278 - retrieval - INFO - Extracting all the processed content.
2025-09-08 22:48:10,407 - retrieval - INFO - Combining all the processed content.
2025-09-08 22:48:10,408 - retrieval - INFO - Processed content ready.
2025-09-08 22:48:10,429 - retrieval - INFO - Saved processed report to ./processed_report/RAMSSOL_GROUP_BERHAD/RAMSSOL_GROUP_BERHAD_IPO.md


In [21]:
await manager.parse_report(
    company = "RAMSSOL_GROUP_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-09 07:34:43,462 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:43,462 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:43,463 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:43,587 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:43,588 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:43,612 - retrieval - INFO - Saved processed report to ./processed_report/RAMSSOL_GROUP_BERHAD/RAMSSOL_GROUP_BERHAD_ANNUAL_2024.md


In [20]:
await manager.parse_report(
    company = "RAMSSOL_GROUP_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-09 07:34:41,618 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:41,619 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:41,619 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:41,757 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:41,757 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:41,784 - retrieval - INFO - Saved processed report to ./processed_report/RAMSSOL_GROUP_BERHAD/RAMSSOL_GROUP_BERHAD_ANNUAL_2023.md


In [19]:
await manager.parse_report(
    company = "RAMSSOL_GROUP_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2022,
    #forced_process = True
)

2025-09-09 07:34:38,546 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:38,547 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:38,547 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:38,665 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:38,665 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:38,689 - retrieval - INFO - Saved processed report to ./processed_report/RAMSSOL_GROUP_BERHAD/RAMSSOL_GROUP_BERHAD_ANNUAL_2022.md


In [18]:
await manager.parse_report(
    company = "RAMSSOL_GROUP_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2021,
    #forced_process = True
)

2025-09-09 07:34:31,675 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:31,676 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:31,677 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:31,787 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:31,788 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:31,812 - retrieval - INFO - Saved processed report to ./processed_report/RAMSSOL_GROUP_BERHAD/RAMSSOL_GROUP_BERHAD_ANNUAL_2021.md


# ECA INTEGRATED SOLUTION BERHAD
- IPO
- ANNUAL 2024
- ANNUAL 2023
- ANNUAL 2022

In [6]:
await manager.parse_report(
    company = "ECA_INTEGRATED_SOLUTION_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-08 22:48:07,281 - retrieval - INFO - Already processed and up to date.
2025-09-08 22:48:07,282 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 22:48:07,284 - retrieval - INFO - Extracting all the processed content.
2025-09-08 22:48:07,411 - retrieval - INFO - Combining all the processed content.
2025-09-08 22:48:07,411 - retrieval - INFO - Processed content ready.
2025-09-08 22:48:07,433 - retrieval - INFO - Saved processed report to ./processed_report/ECA_INTEGRATED_SOLUTION_BERHAD/ECA_INTEGRATED_SOLUTION_BERHAD_IPO.md


In [17]:
await manager.parse_report(
    company = "ECA_INTEGRATED_SOLUTION_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-09 07:34:21,868 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:21,869 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:21,869 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:21,985 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:21,985 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:22,010 - retrieval - INFO - Saved processed report to ./processed_report/ECA_INTEGRATED_SOLUTION_BERHAD/ECA_INTEGRATED_SOLUTION_BERHAD_ANNUAL_2024.md


In [16]:
await manager.parse_report(
    company = "ECA_INTEGRATED_SOLUTION_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-09 07:34:10,055 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:34:10,056 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:34:10,057 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:34:10,171 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:34:10,172 - retrieval - INFO - Processed content ready.
2025-09-09 07:34:10,197 - retrieval - INFO - Saved processed report to ./processed_report/ECA_INTEGRATED_SOLUTION_BERHAD/ECA_INTEGRATED_SOLUTION_BERHAD_ANNUAL_2023.md


In [15]:
await manager.parse_report(
    company = "ECA_INTEGRATED_SOLUTION_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2022,
    #forced_process = True
)

2025-09-09 07:33:55,045 - retrieval - INFO - Already processed and up to date.
2025-09-09 07:33:55,047 - retrieval - INFO - Skipping processing, using existing content.
2025-09-09 07:33:55,047 - retrieval - INFO - Extracting all the processed content.
2025-09-09 07:33:55,183 - retrieval - INFO - Combining all the processed content.
2025-09-09 07:33:55,183 - retrieval - INFO - Processed content ready.
2025-09-09 07:33:55,220 - retrieval - INFO - Saved processed report to ./processed_report/ECA_INTEGRATED_SOLUTION_BERHAD/ECA_INTEGRATED_SOLUTION_BERHAD_ANNUAL_2022.md


# AUTOCOUNT DOTCOM BERHAD
- IPO
- ANNUAL 2024
- ANNUAL 2023

In [2]:
await manager.parse_report(
    company = "AUTOCOUNT_DOTCOM_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-10 17:29:24,537 - retrieval - INFO - Already processed and up to date.
2025-09-10 17:29:24,539 - retrieval - INFO - Skipping processing, using existing content.
2025-09-10 17:29:24,540 - retrieval - INFO - Extracting all the processed content.
2025-09-10 17:29:25,279 - retrieval - INFO - Combining all the processed content.
2025-09-10 17:29:25,280 - retrieval - INFO - Processed content ready.
2025-09-10 17:29:25,350 - retrieval - INFO - Saved processed report to ./processed_report/AUTOCOUNT_DOTCOM_BERHAD/AUTOCOUNT_DOTCOM_BERHAD_IPO.md


In [3]:
await manager.parse_report(
    company = "AUTOCOUNT_DOTCOM_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-10 17:29:27,532 - retrieval - INFO - Already processed and up to date.
2025-09-10 17:29:27,534 - retrieval - INFO - Skipping processing, using existing content.
2025-09-10 17:29:27,536 - retrieval - INFO - Extracting all the processed content.
2025-09-10 17:29:27,854 - retrieval - INFO - Combining all the processed content.
2025-09-10 17:29:27,855 - retrieval - INFO - Processed content ready.
2025-09-10 17:29:27,921 - retrieval - INFO - Saved processed report to ./processed_report/AUTOCOUNT_DOTCOM_BERHAD/AUTOCOUNT_DOTCOM_BERHAD_ANNUAL_2024.md


In [4]:
await manager.parse_report(
    company = "AUTOCOUNT_DOTCOM_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-10 17:29:29,407 - retrieval - INFO - Already processed and up to date.
2025-09-10 17:29:29,407 - retrieval - INFO - Skipping processing, using existing content.
2025-09-10 17:29:29,408 - retrieval - INFO - Extracting all the processed content.
2025-09-10 17:29:29,748 - retrieval - INFO - Combining all the processed content.
2025-09-10 17:29:29,748 - retrieval - INFO - Processed content ready.
2025-09-10 17:29:29,803 - retrieval - INFO - Saved processed report to ./processed_report/AUTOCOUNT_DOTCOM_BERHAD/AUTOCOUNT_DOTCOM_BERHAD_ANNUAL_2023.md


# EDELTEQ HOLDINGS BERHAD
- IPO
- ANNUAL 2024
- ANNUAL 2023

In [5]:
await manager.parse_report(
    company = "EDELTEQ HOLDINGS BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-07 22:12:09,117 - retrieval - INFO - Already processed and up to date.
2025-09-07 22:12:09,117 - retrieval - INFO - Skipping processing, using existing content.
2025-09-07 22:12:09,118 - retrieval - INFO - Extracting all the processed content.
2025-09-07 22:12:09,259 - retrieval - INFO - Combining all the processed content.
2025-09-07 22:12:09,259 - retrieval - INFO - Processed content ready.
2025-09-07 22:12:09,282 - retrieval - INFO - Saved processed report to ./processed_report/EDELTEQ_HOLDINGS_BERHAD/EDELTEQ_HOLDINGS_BERHAD_IPO.md


In [6]:
await manager.parse_report(
    company = "EDELTEQ HOLDINGS BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-07 22:12:11,139 - retrieval - INFO - Already processed and up to date.
2025-09-07 22:12:11,139 - retrieval - INFO - Skipping processing, using existing content.
2025-09-07 22:12:11,140 - retrieval - INFO - Extracting all the processed content.
2025-09-07 22:12:11,257 - retrieval - INFO - Combining all the processed content.
2025-09-07 22:12:11,257 - retrieval - INFO - Processed content ready.
2025-09-07 22:12:11,282 - retrieval - INFO - Saved processed report to ./processed_report/EDELTEQ_HOLDINGS_BERHAD/EDELTEQ_HOLDINGS_BERHAD_ANNUAL_2024.md


In [20]:
await manager.parse_report(
    company = "EDELTEQ HOLDINGS BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-08 03:51:03,349 - retrieval - INFO - Already processed and up to date.
2025-09-08 03:51:03,350 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 03:51:03,350 - retrieval - INFO - Extracting all the processed content.
2025-09-08 03:51:03,529 - retrieval - INFO - Combining all the processed content.
2025-09-08 03:51:03,529 - retrieval - INFO - Processed content ready.
2025-09-08 03:51:03,553 - retrieval - INFO - Saved processed report to ./processed_report/EDELTEQ_HOLDINGS_BERHAD/EDELTEQ_HOLDINGS_BERHAD_ANNUAL_2023.md


# VETECE HOLDINGS BERHAD
- IPO
- ANNUAL 2024

In [19]:
await manager.parse_report(
    company = "VETECE_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-08 03:50:23,624 - retrieval - INFO - Already processed and up to date.
2025-09-08 03:50:23,625 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 03:50:23,625 - retrieval - INFO - Extracting all the processed content.
2025-09-08 03:50:23,734 - retrieval - INFO - Combining all the processed content.
2025-09-08 03:50:23,734 - retrieval - INFO - Processed content ready.
2025-09-08 03:50:23,756 - retrieval - INFO - Saved processed report to ./processed_report/VETECE_HOLDINGS_BERHAD/VETECE_HOLDINGS_BERHAD_IPO.md


In [3]:
await manager.parse_report(
    company = "VETECE_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-08 04:11:51,671 - retrieval - INFO - Already processed and up to date.
2025-09-08 04:11:51,674 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 04:11:51,674 - retrieval - INFO - Extracting all the processed content.
2025-09-08 04:11:52,217 - retrieval - INFO - Combining all the processed content.
2025-09-08 04:11:52,218 - retrieval - INFO - Processed content ready.
2025-09-08 04:11:52,295 - retrieval - INFO - Saved processed report to ./processed_report/VETECE_HOLDINGS_BERHAD/VETECE_HOLDINGS_BERHAD_ANNUAL_2024.md


# CABNET HOLDINGS BERHAD
- IPO
- ANNUAL 2025
- ANNUAL 2024
- ANNUAL 2023
- ANNUAL 2021
- ANNUAL 2020
- ANNUAL 2019
- ANNUAL 2018
- ANNUAL 2017


In [10]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-07 22:17:56,691 - retrieval - INFO - Already processed and up to date.
2025-09-07 22:17:56,691 - retrieval - INFO - Skipping processing, using existing content.
2025-09-07 22:17:56,692 - retrieval - INFO - Extracting all the processed content.
2025-09-07 22:17:56,738 - retrieval - INFO - Combining all the processed content.
2025-09-07 22:17:56,739 - retrieval - INFO - Processed content ready.
2025-09-07 22:17:56,761 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_IPO.md


In [17]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-08 03:49:54,795 - retrieval - INFO - Already processed and up to date.
2025-09-08 03:49:54,796 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 03:49:54,796 - retrieval - INFO - Extracting all the processed content.
2025-09-08 03:49:54,900 - retrieval - INFO - Combining all the processed content.
2025-09-08 03:49:54,900 - retrieval - INFO - Processed content ready.
2025-09-08 03:49:54,923 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2024.md


In [5]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2023,
    #forced_process = True
)

2025-09-08 04:19:42,425 - retrieval - INFO - Already processed and up to date.
2025-09-08 04:19:42,426 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 04:19:42,426 - retrieval - INFO - Extracting all the processed content.
2025-09-08 04:19:42,755 - retrieval - INFO - Combining all the processed content.
2025-09-08 04:19:42,756 - retrieval - INFO - Processed content ready.
2025-09-08 04:19:42,882 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2023.md


In [15]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2025,
    #forced_process = True
)

2025-09-08 03:49:25,791 - retrieval - INFO - Already processed and up to date.
2025-09-08 03:49:25,793 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 03:49:25,793 - retrieval - INFO - Extracting all the processed content.
2025-09-08 03:49:25,916 - retrieval - INFO - Combining all the processed content.
2025-09-08 03:49:25,917 - retrieval - INFO - Processed content ready.
2025-09-08 03:49:25,951 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2025.md


In [4]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2021,
    #forced_process = True
)

2025-09-08 16:40:25,303 - retrieval - INFO - Already processed and up to date.
2025-09-08 16:40:25,304 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 16:40:25,304 - retrieval - INFO - Extracting all the processed content.
2025-09-08 16:40:25,401 - retrieval - INFO - Combining all the processed content.
2025-09-08 16:40:25,402 - retrieval - INFO - Processed content ready.
2025-09-08 16:40:25,425 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2021.md


In [13]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2020,
    #forced_process = True
)

2025-09-08 17:07:54,792 - retrieval - INFO - Already processed and up to date.
2025-09-08 17:07:54,793 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 17:07:54,793 - retrieval - INFO - Extracting all the processed content.
2025-09-08 17:07:54,886 - retrieval - INFO - Combining all the processed content.
2025-09-08 17:07:54,886 - retrieval - INFO - Processed content ready.
2025-09-08 17:07:54,910 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2020.md


In [12]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2019,
    #forced_process = True
)

2025-09-08 17:07:53,224 - retrieval - INFO - Already processed and up to date.
2025-09-08 17:07:53,225 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 17:07:53,225 - retrieval - INFO - Extracting all the processed content.
2025-09-08 17:07:53,339 - retrieval - INFO - Combining all the processed content.
2025-09-08 17:07:53,340 - retrieval - INFO - Processed content ready.
2025-09-08 17:07:53,373 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2019.md


In [11]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2018,
    #forced_process = True
)

2025-09-08 17:07:50,751 - retrieval - INFO - Already processed and up to date.
2025-09-08 17:07:50,752 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 17:07:50,753 - retrieval - INFO - Extracting all the processed content.
2025-09-08 17:07:50,867 - retrieval - INFO - Combining all the processed content.
2025-09-08 17:07:50,867 - retrieval - INFO - Processed content ready.
2025-09-08 17:07:50,890 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2018.md


In [10]:
await manager.parse_report(
    company = "CABNET_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2017,
    #forced_process = True
)

2025-09-08 17:07:48,602 - retrieval - INFO - Already processed and up to date.
2025-09-08 17:07:48,603 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 17:07:48,604 - retrieval - INFO - Extracting all the processed content.
2025-09-08 17:07:48,711 - retrieval - INFO - Combining all the processed content.
2025-09-08 17:07:48,712 - retrieval - INFO - Processed content ready.
2025-09-08 17:07:48,735 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2017.md


# ICT ZONE ASIA BERHAD
- IPO

In [14]:
await manager.parse_report(
    company = "ICT ZONE ASIA BERHAD",
    report_type = ReportType.IPO,
    #forced_process = True
)

2025-09-07 22:38:10,203 - retrieval - INFO - Already processed and up to date.
2025-09-07 22:38:10,203 - retrieval - INFO - Skipping processing, using existing content.
2025-09-07 22:38:10,204 - retrieval - INFO - Extracting all the processed content.
2025-09-07 22:38:10,342 - retrieval - INFO - Combining all the processed content.
2025-09-07 22:38:10,343 - retrieval - INFO - Processed content ready.
2025-09-07 22:38:10,366 - retrieval - INFO - Saved processed report to ./processed_report/ICT_ZONE_ASIA_BERHAD/ICT_ZONE_ASIA_BERHAD_IPO.md


---

### TESTING

In [None]:
# Company and report years for the batch run.
company_name = "CABNET_HOLDINGS_BERHAD"
years = list(range(2025, 2016, -1))  # 2025 down to 2017, newest first

# Candidate companies for ad-hoc test runs.
companies_1 = [
    "EDELTEQ_HOLDINGS_BERHAD",
    "AUTOCOUNT_DOTCOM_BERHAD",
    "ICT_ZONE_ASIA_BERHAD",
    "AEMULUS_HOLDINGS_BERHAD",
    "CABNET_HOLDINGS_BERHAD",
    "VETECE_HOLDINGS_BERHAD",
    "I-STONE_GROUP_BERHAD",
    "CENTURY_SOFTWARE_HOLDINGS_BERHAD",
    "EDARAN_DIGITAL_SYSTEMS_BERHAD",
    "ViTrox Corporation Berhad",
]

# Result accumulators, reset every time this cell is executed.
success, failed = [], []

async def run_all():
    for year in years:
        for attempt in (1, 2):  # tiny retry (2 attempts total)
            try:
                print(f"▶️  [{company_name}] start (attempt {attempt})")
                await manager.parse_report(
                    company = company_name,
                    report_type = ReportType.ANNUAL,
                    year=year,
                    #forced_process=True,
                )
                print(f"✅  [{company_name}] done")
                success.append(company_name)
                break
            except Exception as e:
                print(f"⚠️  [{company_name}] attempt {attempt} failed: {e}")
                if attempt == 2:
                    failed.append((company_name, str(e)))

    print("\n===== SUMMARY =====")
    print(f"✔️  Success: {len(success)} -> {success}")
    print(f"❌ Failed : {len(failed)}")
    for name, err in failed:
        print(f"   - {name}: {err}")

await run_all()

▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 1)


2025-09-08 16:32:21,825 - retrieval - INFO - Already processed and up to date.
2025-09-08 16:32:21,826 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 16:32:21,827 - retrieval - INFO - Extracting all the processed content.
2025-09-08 16:32:21,926 - retrieval - INFO - Combining all the processed content.
2025-09-08 16:32:21,927 - retrieval - INFO - Processed content ready.
2025-09-08 16:32:21,960 - retrieval - INFO - Saved processed report to ./processed_report/CABNET_HOLDINGS_BERHAD/CABNET_HOLDINGS_BERHAD_ANNUAL_2025.md
2025-09-08 16:32:21,978 - retrieval - INFO - Already processed and up to date.
2025-09-08 16:32:21,979 - retrieval - INFO - Skipping processing, using existing content.
2025-09-08 16:32:21,979 - retrieval - INFO - Extracting all the processed content.
2025-09-08 16:32:22,005 - retrieval - INFO - Combining all the processed content.
2025-09-08 16:32:22,005 - retrieval - INFO - Processed content ready.
2025-09-08 16:32:22,025 - retrieval - INF

✅  [CABNET_HOLDINGS_BERHAD] done
▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 1)
✅  [CABNET_HOLDINGS_BERHAD] done
▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 1)
✅  [CABNET_HOLDINGS_BERHAD] done
▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 1)
⚠️  [CABNET_HOLDINGS_BERHAD] attempt 1 failed: No raw reports found for CABNET_HOLDINGS_BERHAD Annual Report & CG Report 2022
▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 2)
⚠️  [CABNET_HOLDINGS_BERHAD] attempt 2 failed: No raw reports found for CABNET_HOLDINGS_BERHAD Annual Report & CG Report 2022
▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 1)


2025-09-08 16:32:22,267 - retrieval - INFO -      Uploading CG_Report_Template_2021.pdf ...
2025-09-08 16:32:22,324 - retrieval - INFO -      Uploading AGM_Admin_Guide_FINAL.pdf ...
2025-09-08 16:32:29,143 - retrieval - INFO - Uploaded 3 PDFs
2025-09-08 16:32:29,145 - retrieval - INFO - Fresh processing mode, extracting TOC.
2025-09-08 16:33:04,985 - retrieval - INFO - Definition tokens: prompt = 55191, output = 222, total = 56415
2025-09-08 16:33:04,986 - retrieval - INFO - Table of Contents extracted
2025-09-08 16:33:05,011 - retrieval - INFO - Saved Table of Contents for CABNET_HOLDINGS_BERHAD
2025-09-08 16:33:05,032 - retrieval - INFO - Extracting section: 1. CORPORATE INFORMATION
2025-09-08 16:33:05,052 - retrieval - INFO - Extracting section: 4. PROFILE OF DIRECTORS
2025-09-08 16:33:05,071 - retrieval - INFO - Extracting section: 5. PROFILE OF KEY SENIOR MANAGEMENT
2025-09-08 16:33:05,091 - retrieval - INFO - Extracting section: 6. MANAGEMENT DISCUSSION AND ANALYSIS
2025-09-08 16

✅  [CABNET_HOLDINGS_BERHAD] done
▶️  [CABNET_HOLDINGS_BERHAD] start (attempt 1)


2025-09-08 16:39:24,000 - retrieval - INFO -      Uploading CabNet_CG_Report_2020_-_Final.pdf ...
2025-09-08 16:39:24,056 - retrieval - INFO -      Uploading Administrative_guide.pdf ...


In [None]:
await manager.parse_report(
    company = "AEMULUS_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    forced_process = True
)

#tasks = [asyncio.create_task(process_one(i+1, sections[i])) for i in [2, 7, 12, 15]] if sections else []

In [None]:
await manager.parse_report(
    company = "I-STONE_GROUP_BERHAD",
    report_type = ReportType.IPO,
    forced_process = True
)

#tasks = [asyncio.create_task(process_one(i+1, sections[i])) for i in [4, 8, 10]] if sections else []

2025-09-08 15:36:14,540 - retrieval - INFO - Fresh processing of all docs.
2025-09-08 15:36:14,540 - retrieval - INFO -      Uploading i-Stone_-_Prospectus_210619_(Part_1).pdf ...
2025-09-08 15:36:14,829 - retrieval - INFO -      Uploading i-Stone_-_Prospectus_210619_(Part_2).pdf ...
2025-09-08 15:36:14,936 - retrieval - INFO -      Uploading i-Stone_-_Detailed_Procedures_for_E-IPO_Share_Application_and_Acceptance.pdf ...
2025-09-08 15:36:21,599 - retrieval - INFO - Uploaded 3 PDFs
2025-09-08 15:36:21,602 - retrieval - INFO - Fresh processing mode, extracting definitions and TOC.
2025-09-08 15:36:21,623 - retrieval - INFO - Definitions already exist, skipping extraction.
2025-09-08 15:36:21,644 - retrieval - INFO - Table of Contents already exists, skipping extraction.
2025-09-08 15:36:21,663 - retrieval - INFO - Sections to extract: ['1. CORPORATE DIRECTORY', '2. PROSPECTUS SUMMARY', '3. APPROVALS AND CONDITIONS', '4. PARTICULARS OF OUR IPO', '5. INFORMATION ON PROMOTERS, SUBSTANTIAL 

In [None]:
await manager.parse_report(
    company = "CENTURY_SOFTWARE_HOLDINGS_BERHAD",
    report_type = ReportType.IPO,
    forced_process = True
)
#tasks = [asyncio.create_task(process_one(15, sections[14]))] if sections else []

2025-09-08 10:39:11,345 - retrieval - INFO - Fresh processing of all docs.
2025-09-08 10:39:11,346 - retrieval - INFO -      Uploading CENSOF-NoticeOfProspectus_(11KB).pdf ...
2025-09-08 10:39:11,396 - retrieval - INFO -      Uploading CENSOF-Cover_to_Page_200_(2.7MB).pdf ...
2025-09-08 10:39:11,621 - retrieval - INFO -      Uploading CENSOF-Page_201_to_Page_416_(2.2MB).pdf ...
2025-09-08 10:39:18,811 - retrieval - INFO - Uploaded 3 PDFs
2025-09-08 10:39:18,813 - retrieval - INFO - Fresh processing mode, extracting definitions and TOC.
2025-09-08 10:39:18,839 - retrieval - INFO - Definitions already exist, skipping extraction.
2025-09-08 10:39:18,865 - retrieval - INFO - Table of Contents already exists, skipping extraction.
2025-09-08 10:39:18,893 - retrieval - INFO - Sections to extract: ['1. CORPORATE DIRECTORY', '2. SUMMARY INFORMATION', '3. PARTICULARS OF THE IPO', '4. RISK FACTORS', '5. INFORMATION ON OUR GROUP', '6. INDUSTRY OVERVIEW', '7. HISTORICAL PROFORMA FINANCIAL INFORMATI

In [None]:
await manager.parse_report(
    company = "AEMULUS_HOLDINGS_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-09-04 12:52:10,644 - retrieval - INFO - No raw reports found for AEMULUS_HOLDINGS_BERHAD Annual Report & CG Report 2024


ValueError: No raw reports found for AEMULUS_HOLDINGS_BERHAD Annual Report & CG Report 2024

In [None]:
await manager.parse_report(
    company = "VITROX_CORPORATION_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-08-30 00:02:53,692 - retrieval - INFO - Already processed and up to date.
2025-08-30 00:02:53,693 - retrieval - INFO - Skipping processing, using existing content.
2025-08-30 00:02:53,697 - retrieval - INFO - Extracting all the processed content.
2025-08-30 00:02:53,841 - retrieval - INFO - Combining all the processed content.
2025-08-30 00:02:53,841 - retrieval - INFO - Processed content ready.
2025-08-30 00:02:53,874 - retrieval - INFO - Saved processed report to ./processed_report/VITROX_CORPORATION_BERHAD/VITROX_CORPORATION_BERHAD_ANNUAL_2024.md


In [None]:
await manager.parse_report(
    company = "FARM_FRESH_BERHAD",
    report_type = ReportType.ANNUAL,
    year = 2024,
    #forced_process = True
)

2025-08-29 23:17:01,701 - retrieval - INFO - Fresh processing of all docs.
2025-08-29 23:17:01,702 - retrieval - INFO -      Uploading Farm_Fresh_Berhad_-_Integrated_Annual_Report_2024_(Part_1).pdf ...
2025-08-29 23:17:01,919 - retrieval - INFO -      Uploading Farm_Fresh_Berhad_-_Integrated_Annual_Report_2024_(Part_2).pdf ...
2025-08-29 23:17:02,046 - retrieval - INFO -      Uploading Farm_Fresh_Berhad_-_Integrated_Annual_Report_2024_(Part_3).pdf ...
2025-08-29 23:17:02,193 - retrieval - INFO -      Uploading Farm_Fresh_Berhad_-_Integrated_Annual_Report_2024_(Part_4).pdf ...
2025-08-29 23:17:02,314 - retrieval - INFO -      Uploading Farm_Fresh_Berhad_-_CG_Report_2024.pdf ...
2025-08-29 23:17:13,266 - retrieval - INFO - Uploaded 5 PDFs
2025-08-29 23:17:13,268 - retrieval - INFO - Fresh processing mode, extracting TOC.
2025-08-29 23:17:13,298 - retrieval - INFO - Table of Contents already exists, skipping extraction.
2025-08-29 23:17:13,319 - retrieval - INFO - Sections to extract: ['1