# Pipeline of Graph Construction


## Initialization


In [1]:
import logging
import os
import nest_asyncio
from dotenv import load_dotenv

In [2]:
from ogmyrag.my_logging import configure_logger
from ogmyrag.base import MongoStorageConfig, Neo4jStorageConfig, PineconeStorageConfig
from ogmyrag.graph_construction import GraphConstructionSystem
from motor.motor_asyncio import AsyncIOMotorClient

In [3]:
# Patch asyncio so the event loop can be re-entered: the notebook kernel
# already runs a loop, and the cells below use `await` against async clients.

nest_asyncio.apply()

In [4]:
# Setup logging

# Pipeline logger: DEBUG level, written to logs/graph_construction.log.
# `to_console` is not passed here, so configure_logger's default applies.
graph_construction_logger = configure_logger(
    name="graph_construction",
    log_level=logging.DEBUG,
    log_file="logs/graph_construction.log",
)

# Per-dependency loggers: each one gets its own DEBUG log file and is created
# with to_console=False.
openai_logger = configure_logger(
    name="openai",
    log_level=logging.DEBUG,
    log_file="logs/openai.log",
    to_console=False,
)

mongo_logger = configure_logger(
    name="mongodb",
    log_level=logging.DEBUG,
    log_file="logs/mongodb.log",
    to_console=False,
)

pinecone_logger = configure_logger(
    name="pinecone",
    log_level=logging.DEBUG,
    log_file="logs/pinecone.log",
    to_console=False,
)

neo4j_logger = configure_logger(
    name="neo4j",
    log_level=logging.DEBUG,
    log_file="logs/neo4j.log",
    to_console=False,
)

In [5]:
# Load environment variables
#
# Reads every credential/setting the pipeline needs from the environment
# (after loading the .env file) and logs an error for each group that is
# incomplete. Missing values are only logged; execution is not stopped here.

# override=True: values from the .env file take precedence over variables
# already present in the process environment.
load_dotenv(override=True)

# --- MongoDB ---
mongo_db_uri = os.getenv("MONGO_DB_URI", "")

# --- OpenAI ---
openai_api_key = os.getenv("OPENAI_API_KEY", "")

# --- Pinecone index for entities ---
pinecone_entities_api_key = os.getenv("PINECONE_ENTITIES_API_KEY", "")
pinecone_entities_environment = os.getenv("PINECONE_ENTITIES_ENVIRONMENT", "")
pinecone_entities_cloud = os.getenv("PINECONE_ENTITIES_CLOUD", "")
pinecone_entities_metric = os.getenv("PINECONE_ENTITIES_METRIC", "")
# No default is given, so this is None when unset and a string otherwise.
# NOTE(review): the value is not converted to int here — confirm that
# PineconeStorageConfig / its consumer accepts or converts a string.
pinecone_entities_dimensions = os.getenv("PINECONE_ENTITIES_DIMENSIONS")

# --- Pinecone index for the entities cache ---
pinecone_cache_api_key = os.getenv("PINECONE_CACHE_API_KEY", "")
pinecone_cache_environment = os.getenv("PINECONE_CACHE_ENVIRONMENT", "")
pinecone_cache_cloud = os.getenv("PINECONE_CACHE_CLOUD", "")
pinecone_cache_metric = os.getenv("PINECONE_CACHE_METRIC", "")
# Same caveat as pinecone_entities_dimensions: None or a string.
pinecone_cache_dimensions = os.getenv("PINECONE_CACHE_DIMENSIONS")

# --- Neo4j ---
neo4j_uri = os.getenv("NEO4J_URI", "")
neo4j_username = os.getenv("NEO4J_USERNAME", "")
neo4j_password = os.getenv("NEO4J_PASSWORD", "")

# --- Validation: empty string and None are both treated as "not set" ---
if not mongo_db_uri:
    graph_construction_logger.error("Please set the MONGO_DB_URI environment variable.")

if not openai_api_key:
    graph_construction_logger.error(
        "Please set the OPENAI_API_KEY environment variable."
    )

if (
    not pinecone_entities_api_key
    or not pinecone_entities_environment
    or not pinecone_entities_cloud
    or not pinecone_entities_metric
    or not pinecone_entities_dimensions
):
    graph_construction_logger.error(
        "Please set the PINECONE_ENTITIES_API_KEY, PINECONE_ENTITIES_ENVIRONMENT, PINECONE_ENTITIES_CLOUD, PINECONE_ENTITIES_METRIC, and PINECONE_ENTITIES_DIMENSIONS environment variables."
    )

if (
    not pinecone_cache_api_key
    or not pinecone_cache_environment
    or not pinecone_cache_cloud
    or not pinecone_cache_metric
    or not pinecone_cache_dimensions
):
    # The message names the CACHE variables that this condition checks.
    graph_construction_logger.error(
        "Please set the PINECONE_CACHE_API_KEY, PINECONE_CACHE_ENVIRONMENT, PINECONE_CACHE_CLOUD, PINECONE_CACHE_METRIC, and PINECONE_CACHE_DIMENSIONS environment variables."
    )

if not neo4j_uri or not neo4j_username or not neo4j_password:
    # Variable names are spelled exactly as they are read above (NEO4J_*).
    graph_construction_logger.error(
        "Please set the NEO4J_URI, NEO4J_USERNAME, and NEO4J_PASSWORD environment variables."
    )

## Construction Pipeline


### Initialize Graph Construction System


In [6]:
# ---------------------------------------------------------------------------
# MongoDB collections (all in the "ogmyrag" database)
# ---------------------------------------------------------------------------
ontology_config: MongoStorageConfig = {
    "database_name": "ogmyrag",
    "collection_name": "ontology_v2",
}

entity_config: MongoStorageConfig = {
    "database_name": "ogmyrag",
    "collection_name": "entities_test",
}

relationship_config: MongoStorageConfig = {
    "database_name": "ogmyrag",
    "collection_name": "relationships_test",
}

disclosure_config: MongoStorageConfig = {
    "database_name": "ogmyrag",
    "collection_name": "company_disclosures",
}

# ---------------------------------------------------------------------------
# MongoDB cache databases: only a database name is given, no collection_name.
# ---------------------------------------------------------------------------
entities_cache_config: MongoStorageConfig = {
    "database_name": "ogmyrag_entities_cache",
}

relationships_cache_config: MongoStorageConfig = {
    "database_name": "ogmyrag_relationships_cache",
}

# ---------------------------------------------------------------------------
# MongoDB collections for deduplication pending tasks
# ---------------------------------------------------------------------------
entities_deduplication_pending_tasks_config: MongoStorageConfig = {
    "database_name": "ogmyrag",
    "collection_name": "entities_deduplication_pending_tasks",
}

relationships_deduplication_pending_tasks_config: MongoStorageConfig = {
    "database_name": "ogmyrag",
    "collection_name": "relationships_deduplication_pending_tasks",
}

# ---------------------------------------------------------------------------
# Neo4j connection, built from the NEO4J_* environment values
# ---------------------------------------------------------------------------
graphdb_config: Neo4jStorageConfig = {
    "uri": neo4j_uri,
    "user": neo4j_username,
    "password": neo4j_password,
}

# ---------------------------------------------------------------------------
# Pinecone indexes: one for entities, one for the entities cache. Both use the
# same OpenAI API key; the Pinecone settings come from the PINECONE_ENTITIES_*
# and PINECONE_CACHE_* environment values respectively.
# ---------------------------------------------------------------------------
entity_vector_config: PineconeStorageConfig = {
    "index_name": "ogmyrag",
    "pinecone_api_key": pinecone_entities_api_key,
    "pinecone_environment": pinecone_entities_environment,
    "pinecone_cloud": pinecone_entities_cloud,
    "pinecone_metric": pinecone_entities_metric,
    "pinecone_dimensions": pinecone_entities_dimensions,
    "openai_api_key": openai_api_key,
}

entity_cache_vector_config: PineconeStorageConfig = {
    "index_name": "ogmyrag-entities-cache",
    "pinecone_api_key": pinecone_cache_api_key,
    "pinecone_environment": pinecone_cache_environment,
    "pinecone_cloud": pinecone_cache_cloud,
    "pinecone_metric": pinecone_cache_metric,
    "pinecone_dimensions": pinecone_cache_dimensions,
    "openai_api_key": openai_api_key,
}

In [7]:
# Async MongoDB client handed to the graph construction system. Server
# selection is given a 5000 ms timeout.
async_mongo_client = AsyncIOMotorClient(mongo_db_uri, serverSelectionTimeoutMS=5000)

# Build the system from the storage configs defined in the previous cell.
# NOTE(review): relationships_deduplication_pending_tasks_config is defined in
# the config cell but is not passed here — confirm whether that is intended.
# NOTE(review): a failure is only logged, so `graph_system` stays undefined and
# every later cell that uses it will raise NameError.
try:
    graph_system = GraphConstructionSystem(
        async_mongo_client=async_mongo_client,
        ontology_config=ontology_config,
        disclosure_config=disclosure_config,
        entity_config=entity_config,
        relationship_config=relationship_config,
        entity_vector_config=entity_vector_config,
        graphdb_config=graphdb_config,
        entity_cache_config=entities_cache_config,
        relationship_cache_config=relationships_cache_config,
        entity_cache_vector_config=entity_cache_vector_config,
        entities_deduplication_pending_tasks_config=entities_deduplication_pending_tasks_config,
    )
except Exception as exc:
    graph_construction_logger.error(
        f"GraphConstructionSystem\nError while creating graph construction system: {exc}"
    )

### Process unparsed documents


In [None]:
try:
    await graph_system.extract_entities_relationships_from_unparsed_documents(
        from_company="AUTOCOUNT_DOTCOM_BERHAD",
        document_type="PROSPECTUS",
        published_at="14-April-2023",
        num_of_relationships_per_onto=6,
        exclude_documents=[
            # "ADB_PROSPECTUS_SECTION_1",
            # "ADB_PROSPECTUS_SECTION_2",
            # "ADB_PROSPECTUS_SECTION_3",
            # "ADB_PROSPECTUS_SECTION_4",
            # "ADB_PROSPECTUS_SECTION_5",
            "ADB_PROSPECTUS_SECTION_6",
            "ADB_PROSPECTUS_SECTION_7A",
            "ADB_PROSPECTUS_SECTION_7B",
            "ADB_PROSPECTUS_SECTION_7C",
            "ADB_PROSPECTUS_SECTION_8",
            "ADB_PROSPECTUS_SECTION_9A",
            "ADB_PROSPECTUS_SECTION_9B",
            "ADB_PROSPECTUS_SECTION_10",
        ],
    )
except Exception as e:
    graph_construction_logger.error(
        f"GraphConstructionSystem\nError while processing entities and relationships from unparsed documents:\n {e}"
    )

In [None]:
try:
    await graph_system.deduplicate_entities(
        from_company="AUTOCOUNT_DOTCOM_BERHAD",
        num_of_entities_per_batch=10,
        max_cache_size=1000,
        similarity_threshold=0.6,
        num_of_relationships_to_fetch=10,
        max_wait_time_per_task=5
    )
except Exception as e:
    graph_construction_logger.error(
        f"GraphConstructionSystem\nError while deduplicating entities:\n {e}"
    )
    

In [15]:
try:
    await graph_system.deduplicate_relationships(
        from_company="AUTOCOUNT_DOTCOM_BERHAD",
        num_of_relationships_per_batch=5,
        max_cache_size=1000,
        max_wait_time_per_task=5
    )
except Exception as e:
    graph_construction_logger.error(
        f"GraphConstructionSystem\nError while deduplicating relationships:\n {e}"
    )

2025-08-20 19:52:49,231 - graph_construction - DEBUG - GraphConstructionSystem
Relationships to deduplicate:
[
    {
        "_id": "68a5acf2d3d7d9a2bb587492",
        "source_id": "68a5acf2d3d7d9a2bb58748f",
        "target_id": "68a5acf2d3d7d9a2bb587490",
        "type": "listedOn",
        "description": [
            "As of January 2023, Autocount Dotcom Berhad obtained approval from Bursa Malaysia Securities Berhad to list and quote its entire enlarged issued share capital on the ACE Market of Bursa Securities, indicating its planned admission in April 2023."
        ],
        "valid_in": [
            2023
        ],
        "originated_from": [
            "AUTOCOUNT_DOTCOM_BERHAD"
        ],
        "status": "TO_BE_DEDUPLICATED",
        "created_at": "2025-08-20 11:09:38.776000",
        "last_modified_at": "2025-08-20 11:42:57.199000"
    },
    {
        "_id": "68a5acf2d3d7d9a2bb587493",
        "source_id": "68a5acf2d3d7d9a2bb58748f",
        "target_id": "68a5acf2d3d7d9

In [14]:
try:
    await graph_system.revert_deduplication_status(
        from_company="AUTOCOUNT_DOTCOM_BERHAD",
        from_status="UPSERTED_INTO_GRAPH_DB",
        to_status="TO_BE_DEDUPLICATED",
        collection_to_revert="RELATIONSHIPS"
    )
except Exception as e:
    graph_construction_logger.error(f"GraphConstructionSystem\nError:\n {e}")

2025-08-20 19:50:49,966 - graph_construction - INFO - Revert status successful for collection 'RELATIONSHIPS'. Updated 181 documents from status 'UPSERTED_INTO_GRAPH_DB' to 'TO_BE_DEDUPLICATED'.


In [None]:
# try:
#     await graph_system.resolve_entities_deduplication_pending_tasks(
#         from_company="AUTOCOUNT_DOTCOM_BERHAD",
#     )
# except Exception as e:
#     graph_construction_logger.error(f"GraphConstructionSystem\nError:\n {e}")

In [11]:
try:
   await graph_system.upsert_entities_into_pinecone(
      from_company="AUTOCOUNT_DOTCOM_BERHAD", batch_size=100
   )
except Exception as e:
    graph_construction_logger.error(f"GraphConstructionSystem\nError:\n {e}")

2025-08-20 19:45:28,315 - graph_construction - INFO - Starting batch upsert process for entities into Pinecone.
2025-08-20 19:45:40,689 - graph_construction - INFO - Successfully processed batch. Total entities processed so far: 100
2025-08-20 19:45:47,305 - graph_construction - INFO - Successfully processed batch. Total entities processed so far: 135
2025-08-20 19:45:47,346 - graph_construction - INFO - No more entities to process. Batch upsert completed.
2025-08-20 19:45:47,349 - graph_construction - INFO - GraphConstructionSystem
Finished upserting entities into Pinecone. Total entities processed: 135.


In [16]:
try:
    await graph_system.upsert_entities_and_relationships_into_neo4j(
        from_company="AUTOCOUNT_DOTCOM_BERHAD", batch_size=100
    )
except Exception as e:
    graph_construction_logger.error(f"GraphConstructionSystem\nError:\n {e}")

2025-08-20 19:55:44,656 - graph_construction - INFO - Starting batch upsert process for entities into Neo4j.
2025-08-20 19:55:44,693 - graph_construction - INFO - No more entities to process. Batch upsert completed.
2025-08-20 19:55:44,697 - graph_construction - INFO - GraphConstructionSystem
Finished upserting entities into Neo4j. Total entities processed: 0.
2025-08-20 19:55:44,700 - graph_construction - INFO - Starting batch upsert process for relationships into Neo4j.
2025-08-20 19:55:47,318 - graph_construction - INFO - Successfully processed batch. Total relationships processed so far: 100
2025-08-20 19:55:48,712 - graph_construction - INFO - Successfully processed batch. Total relationships processed so far: 176
2025-08-20 19:55:48,747 - graph_construction - INFO - No more relationships to process.
2025-08-20 19:55:48,749 - graph_construction - INFO - GraphConstructionSystem
Finished upserting relationships into Neo4j. Total relationships processed: 176.


In [None]:
# Smoke-test query: ask for up to 20 entities similar to the query text,
# restricted by the filter to entries whose "type" equals "LegalEntity".
# score_threshold=0.0 is the lowest cut-off used here — presumably it keeps
# every match; confirm against the method's implementation.
# Unlike the other pipeline cells, this call is not wrapped in try/except, so
# an exception will surface in the cell output.
result = await graph_system.get_formatted_similar_entities_from_pinecone(
    query_texts=["what is autocount"],
    top_k=20,
    query_filter={"type": {"$eq": "LegalEntity"}},
    score_threshold=0.0,
)

# Log the formatted result through the pipeline logger.
graph_construction_logger.info(result)