In [1]:
# Basic imports
import json
import os
from datetime import datetime, timezone
import uuid
from typing import List, Dict, Any, Optional

# OpenAI client (or your preferred LLM client)
from openai import AsyncOpenAI

# Weave for tracing (optional, if you're using it)
# import weave

# MongoDB
import motor
from motor.motor_asyncio import AsyncIOMotorClient
import motor.motor_asyncio # Make sure this import is at the top of your file
import pymongo # For ObjectId, and some potential sync operations or type hints

# XML Processing
from lxml import etree # Preferred for robust XML/XPath/Namespace handling

# Pydantic for structuring tool inputs and some internal models
from pydantic import BaseModel, Field

print("Imports successful.")

Imports successful.


In [2]:
# Akoma Ntoso v3.0 Namespace
AKN_NAMESPACE = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"
# Namespace map handed to lxml when creating elements: the None key binds
# AKN_NAMESPACE as the default (unprefixed) namespace.
NSMAP = {None: AKN_NAMESPACE} # Default namespace for lxml

# --- Indonesian Legal Structure to AKN Mapping (Revised based on UU No.8/1961 example) ---
AKN_STRUCTURE_MAPPING_INFO = """
Akoma Ntoso (AKN) v3.0 Mapping for Indonesian Legal Documents:

**A. General Principles**:
1.  **Document Root**: For Undang-Undang (UU), use `<akomaNtoso><act name="indonesianAct" ...>`.
2.  **`eId` (Element ID) Strategy**:
    * Crucial for all addressable structural elements.
    * Hierarchical and predictable: `containerId__elementTypePrefix_number`.
    * Common prefixes:
        * `meta`, `identification`, `publication`
        * `preamble`, `rec` (recital), `cit` (citation), `formula`
        * `body`
        * `ch` (chapter), `sec` (section - for Bagian), `subsec` (subsection - for Paragraf structure)
        * `art` (article - for Pasal), `para` (paragraph - for Ayat)
        * `list` (for a list within an Ayat/paragraph), `point` (for an item 'a.', 'b.' in a list)
        * `conclusions`, `docGeneration`, `signatureBlock`, `signature`
        * `attachment`, `elucidation`, `elucidationGeneral`, `elucidationArticle`
    * Example: `body__art_1` (Pasal 1), `body__art_1__para_1` (Ayat 1 of Pasal 1), `body__art_1__para_1__list_1__point_a` (Point 'a' in Ayat 1 of Pasal 1).
    * Generate eIds sequentially within their parent. `art_1`, `art_2`, etc. `art_1__para_1`, `art_1__para_2`, etc.

**B. Document Structure Mapping**:

1.  **Metadata (`<meta>`)**:
    * `identification source="#source_for_main_doc"`: Contains FRBR elements.
        * `FRBRWork`: `FRBRthis value="/akn/id/act/YYYY/NN/main"`, `FRBRuri value="/akn/id/act/YYYY/NN"`, `FRBRdate date="YYYY-MM-DD" name="enactment"`, `FRBRauthor href="#author_ref"`, `FRBRcountry value="id"`, `FRBRname value="FULL_TITLE_TENTANG"`, `FRBRnumber value="NN"`, `FRBRprescriptive value="true"`.
        * `FRBRManifestation`: `FRBRthis value="/akn/id/act/YYYY/NN/id@YYYY-MM-DD/main.xml"`, `FRBRuri value="/akn/id/act/YYYY/NN/id@YYYY-MM-DD.xml"`, `FRBRdate date="YYYY-MM-DD" name="publication"`, `FRBRformat value="application/akn+xml"`.
    * `publication name="LembaranNegara" showAs="Lembaran Negara Republik Indonesia" date="YYYY-MM-DD" number="LN_NUMBER" TLA="LNRI" eId="meta__ln"`
    * `lifecycle source="#source_for_main_doc"`: Events like enactment.
    * `references source="#source_for_main_doc"`: Define authorities like `#presiden_ri`, `#dpr_gr`.

2.  **Preamble (`<preamble eId="preamble">`)**:
    * Opening line like "PRESIDEN REPUBLIK INDONESIA,": `<docTitle eId="preamble__doctitle_1">PRESIDEN REPUBLIK INDONESIA,</docTitle>` or similar.
    * `Menimbang:`: `<recitals eId="preamble__recitals_1">`. Each point `a. ...;` becomes a `<recital eId="preamble__recitals_1__rec_a"><num>a.</num><p>bahwa ...;</p></recital>`.
    * `Mengingat:`: `<citations eId="preamble__citations_1">`. Each point `a. ...;` becomes a `<citation eId="preamble__citations_1__cit_a"><num>a.</num><p>Pasal ...;</p></citation>`.
    * `Dengan persetujuan...;`: `<container name="approvalStatement" eId="preamble__approval_1"><p>Dengan persetujuan Dewan Perwakilan Rakyat Gotong Royong;</p></container>`
    * `MEMUTUSKAN :`: `<formula name="decision" eId="preamble__formula_decision"> <p>MEMUTUSKAN :</p> </formula>` (This formula might then contain the enacting clauses).
    * `I. Mencabut...;`: Within or after the decision formula, could be `<container name="revocation" eId="preamble__revocation_1"><num>I.</num><p>Mencabut: Undang-undang ... Lembaran-Negara ...);</p></container>`
    * `II. Menetapkan: UNDANG-UNDANG TENTANG ... .`: `<enactingFormula eId="preamble__enactingformula_1" name="enactingFormula"><num>II.</num><p>Menetapkan: <docType refersTo="#indonesianAct">UNDANG-UNDANG</docType> TENTANG <docTitle refersTo="#mainTitle">WAJIB KERJA SARJANA</docTitle>.</p></enactingFormula>`

3.  **Body (`<body eId="body">`)**:
    * `BAB X JUDUL BAB`: If present, `<chapter eId="body__ch_X" num="X"><num>BAB X</num><heading>JUDUL BAB</heading> ... </chapter>`.
    * `Bagian Kesatu JUDUL BAGIAN`: If present, `<section eId="body__ch_X__sec_Y" num="Y"><num>Bagian Kesatu</num><heading>JUDUL BAGIAN</heading> ... </section>`.
    * `Pasal N`: `<article eId="body__art_N" num="N"><num>Pasal N.</num><content> ... </content></article>`. If a Pasal has a title, use `<heading>Judul Pasal</heading>` after `<num>`.
    * `Ayat (M)`: Within `<article><content>`, use `<paragraph eId="body__art_N__para_M" num="(M)"><num>(M)</num><content><p>Teks ayat...</p></content></paragraph>`.
    * Lists within Ayat (e.g., `a. ...; b. ...;`): `<list eId="body__art_N__para_M__list_1" type="alpha"><point eId="body__art_N__para_M__list_1__point_a"><num>a.</num><content><p>Teks poin a...</p></content></point> <point eId="body__art_N__para_M__list_1__point_b"><num>b.</num><content><p>Teks poin b...</p></content></point></list>`.
    * Concluding sentences of an Ayat after a list are part of the last `<p>` of the list's final point, or a new `<p>` within the Ayat's `<content>` after the `<list>`.

4.  **Concluding Provisions (Usually last Articles in `<body>` or in `<conclusions>`)**:
    * `Pasal X` (Pelaksanaan): `<article eId="body__art_X" num="X" heading="Pelaksanaan"><num>Pasal X.</num><content><p>Pelaksanaan...</p></content></article>`. Could have a `role` attribute.
    * `Pasal Y` (Mulai Berlaku): `<article eId="body__art_Y" num="Y" heading="Ketentuan Peralihan"><num>Pasal Y.</num><content><p>Undang-undang ini mulai berlaku...</p></content></article>`.
    * Promulgation command: `<container name="promulgationCommand" eId="body__promulgation_1"><p>Agar supaya setiap orang dapat mengetahuinya...</p></container>`.

5.  **Signatures & Dates (`<conclusions eId="conclusions">`)**:
    * `<container name="placeDateOfSignature" eId="conclusions__sigblock_1__placedate_1"><p>Disahkan di Jakarta</p><p>pada tanggal 29 April 1961.</p></container>`
    * `<container name="signatureBlock" eId="conclusions__sigblock_1"> <signature eId="conclusions__sigblock_1__signature_1"><role refersTo="#presidenRI">Pejabat Presiden Republik Indonesia,</role><person refersTo="#djuanda">DJUANDA</person></signature> </container>`
    * Similar structure for "Diundangkan di..." and the Sekretaris Negara.

6.  **`PENJELASAN` (Elucidation)**:
    * Attached to the main document: `<attachments eId="attachments_1"> <attachment eId="attachments_1__elucidation_1" type="elucidation"> ... </attachment> </attachments>`.
    * The elucidation itself will have `meta`, `preface` (containing its title), and `body`.
    * `PENJELASAN UMUM.`: `<container name="generalExplanation" eId="attachments_1__elucidation_1__body__general_1"> <heading>PENJELASAN UMUM.</heading> <p>Teks...</p> ... </container>`.
    * `PENJELASAN PASAL DEMI PASAL.`: `<container name="articleByArticleExplanation" eId="attachments_1__elucidation_1__body__artbyart_1"> <heading>PENJELASAN PASAL DEMI PASAL.</heading> ... </container>`.
        * `Pasal 1.`: `<clause eId="attachments_1__elucidation_1__body__artbyart_1__art_1" refersTo="#body__art_1"> <heading>Pasal 1.</heading> <p>Teks penjelasan pasal 1...</p> </clause>`.
        * `Pasal 2 dan 3.`: `<clause eId="attachments_1__elucidation_1__body__artbyart_1__art_2-3" refersTo="#body__art_2 #body__art_3"> <heading>Pasal 2 dan 3.</heading> <p>Teks penjelasan...</p> </clause>`.

**C. Agent Instructions**:
* You MUST generate valid XML.
* For each new element, you MUST generate a unique and hierarchical `eId`.
* When text seems to continue from a previous chunk into an existing element (like a `<p>`), use the `update_akn_element` tool with `append_text_content`.
* When adding a new structural element (like a new `<paragraph>` (Ayat) or `<article>` (Pasal)), use the `add_akn_element` tool.
* Always refer to the existing document structure using `get_akn_document_context` to ensure correct placement and `eId` generation.
* State the `eId` of the element you are primarily working on or have just completed for context logging.
"""

print("Revised Akoma Ntoso configuration loaded.")
# For debugging or inspection:
# print(AKN_STRUCTURE_MAPPING_INFO)

Revised Akoma Ntoso configuration loaded.


In [3]:
# Ensure your API key is set as an environment variable
# OPENROUTER_API_KEY or OPENAI_API_KEY
# For OpenRouter:
_openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not _openrouter_api_key:
    # Fail fast: with api_key=None the OpenAI SDK falls back to OPENAI_API_KEY,
    # which would then be sent to the OpenRouter endpoint instead of OpenAI.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it before running this cell, "
        "or switch to the OpenAI client below."
    )
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_openrouter_api_key,
)
# Or for OpenAI:
# client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Select your preferred model
# LLM_MODEL = "openai/gpt-4o"
# LLM_MODEL = "openai/gpt-4-turbo"
LLM_MODEL = "google/gemini-2.5-flash-preview-05-20" # Example
# LLM_MODEL = "anthropic/claude-3.5-sonnet"

print(f"Using LLM Model: {LLM_MODEL}")

Using LLM Model: google/gemini-2.5-flash-preview-05-20


In [4]:
class MongoDBManager:
    """Holds the Motor client together with the database and collection handles.

    The three handles (``async_client``, ``db``, ``documents_collection``) are
    ``None`` until ``connect()`` succeeds, and are reset to ``None`` by
    ``close()`` or by a failed connection attempt.
    """

    def __init__(self, connection_string: str = "mongodb://localhost:27017/", db_name: str = "hukum_terbuka_akn"):
        self.connection_string = connection_string
        self.db_name = db_name
        self.async_client = self.db = self.documents_collection = None

    def _clear_handles(self):
        """Forget the client, database and collection handles."""
        self.async_client = None
        self.db = None
        self.documents_collection = None

    async def connect(self):
        """Ensure a live connection exists; return True on success, False on failure.

        If all handles are already set, the server is pinged first and the
        existing connection is reused when the ping succeeds. Otherwise (or
        when the ping fails) a new client is created and verified with a ping.
        """
        handles = (self.async_client, self.db, self.documents_collection)
        if all(handle is not None for handle in handles):
            try:
                await self.async_client.admin.command('ping')
            except Exception as ping_error:
                print(f"Stale connection detected (ping failed: {ping_error}), attempting to reconnect.")
                # close() also resets every handle before the reconnect below.
                await self.close()
            else:
                print(f"✅ MongoDB connection already active for: {self.db_name}")
                return True

        print(f"Attempting to connect to MongoDB: {self.db_name}...")
        try:
            self.async_client = motor.motor_asyncio.AsyncIOMotorClient(
                self.connection_string,
                serverSelectionTimeoutMS=5000,
            )
            # The ping is what actually verifies that the server is reachable.
            await self.async_client.admin.command('ping')
            self.db = self.async_client[self.db_name]
            self.documents_collection = self.db["legal_documents_akn"]
            print(f"✅ Connected to MongoDB: {self.db_name}, collection: {self.documents_collection.name}")
            return True
        except Exception as connect_error:
            print(f"❌ MongoDB connection failed: {connect_error}")
            if self.async_client: # pragma: no cover
                self.async_client.close()
            self._clear_handles()
            return False

    async def close(self):
        """Close the client if one is held, then reset all handles."""
        client = self.async_client
        if client:
            client.close()
            print("MongoDB connection closed.")
        self._clear_handles()

# Initialize MongoDB manager
mongo_manager = MongoDBManager()

# Example of how to connect (will be called later in the main workflow)
# await mongo_manager.connect()

In [5]:
tools_schemas = [
    {
        "type": "function",
        "function": {
            "name": "initialize_akn_document",
            "description": "Initializes a new Akoma Ntoso document skeleton in the database with basic metadata. Should be called only once for a new document, typically when processing the first chunk.",
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document (e.g., UU_8_1961)."},
                    "source_file": {"type": "string", "description": "Filename of the source text document."},
                    "doc_type_hint": {"type": "string", "description": "Hint for the AKN document type (e.g., 'act', 'bill', 'judgment'). Defaults to 'act'."},
                    "initial_metadata": {
                        "type": "object",
                        "description": "Basic metadata extracted from the first part of the document.",
                        "properties": {
                            "title": {"type": "string", "description": "Official title (e.g., 'WAJIB KERJA SARJANA')."},
                            "number": {"type": "string", "description": "Document number (e.g., '8')."},
                            "year": {"type": "string", "description": "Year of the document (e.g., '1961')."},
                            "doc_subtype": {"type": ["string", "null"], "description": "Subtype like 'UNDANG-UNDANG', 'PERATURAN PEMERINTAH'."},
                            "country_code": {"type": "string", "default": "id", "description": "ISO 3166-1 alpha-2 country code, defaults to 'id'."},
                            "author_placeholder": {"type": ["string", "null"], "description": "Placeholder for authoring authority if known."}
                        },
                        "required": ["title", "number", "year", "doc_subtype"]
                    }
                },
                "required": ["document_id", "source_file", "initial_metadata"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "get_akn_document_context",
            "description": "Retrieves parts of the current AKN XML document to provide context to the agent. Can fetch the full document, a specific element by eId, or its parent and siblings.",
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "target_eId": {"type": ["string", "null"], "description": "Optional eId of the element to focus on. If null, might return high-level structure or last modified area."},
                    "context_type": {
                        "type": "string",
                        "enum": ["full", "element_only", "element_with_children", "parent_and_siblings"],
                        "default": "element_with_children",
                        "description": "Type of context to retrieve if target_eId is specified."
                    },
                    "max_depth": {"type": "integer", "default": 2, "description": "Max depth of children to retrieve if 'element_with_children' is chosen."}
                },
                "required": ["document_id"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "add_akn_element",
            "description": "Adds a new Akoma Ntoso element (as an XML string) as a child of a specified parent element or as a sibling to another element within the document.",
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "parent_eId": {"type": ["string","null"], "description": "The eId of the parent element to which the new element will be appended as a child. Use 'root' to add to the main document element if appropriate (e.g. for preamble, body, conclusions under <act>). If adding as sibling, this might be the parent of the sibling."},
                    "new_element_akn_xml": {"type": "string", "description": "The well-formed AKN XML string for the new element to be added (e.g., '<paragraph eId=\"new_para_1\"><num>(x)</num><content><p>Text</p></content></paragraph>'). Ensure it's valid XML and adheres to AKN structure."},
                    "sibling_eId": {"type": ["string", "null"], "description": "Optional eId of an existing sibling element. If provided, 'insert_position' determines placement relative to this sibling."},
                    "insert_position": {
                        "type": "string",
                        "enum": ["append_to_parent", "before_sibling", "after_sibling"],
                        "default": "append_to_parent",
                        "description": "Position to insert the new element. 'append_to_parent' adds as last child of parent_eId. 'before_sibling' and 'after_sibling' require sibling_eId and parent_eId (parent of the sibling)."
                    }
                },
                "required": ["document_id", "new_element_akn_xml"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "update_akn_element",
            "description": "Updates an existing Akoma Ntoso element in the document. Can update attributes or text content.",
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "target_eId": {"type": "string", "description": "The eId of the AKN element to modify."},
                    "new_attributes": {"type": ["object", "null"], "description": "A dictionary of attribute key-value pairs to set or update on the element. Values should be strings. E.g. {\"num\": \"(new)\", \"status\": \"updated\"}."},
                    "new_text_content": {"type": ["string", "null"], "description": "If provided, replaces the direct text content of the element. Be cautious with elements that have mixed content or child elements containing text."},
                    "append_text_content": {"type": ["string", "null"], "description": "If provided, appends this text to the existing direct text content of the element. Useful for content spanning chunks. Ensure target element is appropriate for direct text appending (e.g., a <p> tag)."}
                },
                "required": ["document_id", "target_eId"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "store_chunk_processing_info",
            "description": "Stores information about the processing of a specific chunk, including a summary and the eId of the last AKN element significantly handled in that chunk.",
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "chunk_id": {"type": "string", "description": "Identifier of the processed chunk."},
                    "analysis_summary": {"type": "string", "description": "A brief summary of what was identified and done in this chunk."},
                    "last_processed_akn_eId": {"type": "string", "description": "The eId of the primary Akoma Ntoso element that was last created or modified in this chunk. This helps in resuming context for the next chunk."},
                    "errors_or_notes": {"type": ["string", "null"], "description": "Any errors encountered or specific notes from the agent for this chunk."}
                },
                "required": ["document_id", "chunk_id", "analysis_summary", "last_processed_akn_eId"]
            }
        }
    }
]
print(f"Defined {len(tools_schemas)} tool schemas.")

Defined 5 tool schemas.


In [6]:
# --- XML Helper Functions (using lxml) ---

def parse_xml_string(xml_string: str) -> Optional[etree._Element]:
    """Parse an XML string into an lxml element.

    Args:
        xml_string: XML text; in this notebook it is either the stored
            document or a fragment supplied by the LLM agent.

    Returns:
        The parsed root element, or ``None`` when the input is empty, is not a
        string, or is not well-formed XML (the problem is printed).
    """
    if not isinstance(xml_string, str) or not xml_string.strip():
        print("XML parse skipped: input is empty or not a string.")
        return None
    try:
        # The input may come straight from the model, so entity expansion and
        # network access are disabled to rule out XXE-style file/URL inclusion.
        parser = etree.XMLParser(
            remove_blank_text=True,
            strip_cdata=False,
            resolve_entities=False,
            no_network=True,
        )
        return etree.fromstring(xml_string.encode('utf-8'), parser=parser)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError: {e}. Problematic XML: {xml_string[:500]}...")
        return None

def xml_to_string(element: etree._Element, pretty_print=True) -> str:
    """Serialize an lxml element to a Unicode string without an XML declaration.

    ``pretty_print`` is forwarded to ``etree.tostring`` unchanged.
    """
    serialization_options = {
        "pretty_print": pretty_print,
        "encoding": "unicode",
        "xml_declaration": False,
    }
    return etree.tostring(element, **serialization_options)

def find_element_by_eid(root: etree._Element, eid: str) -> Optional[etree._Element]:
    """Return the first descendant of ``root`` whose ``eId`` attribute equals ``eid``.

    Args:
        root: Element to search below (``root`` itself is not matched).
        eid: The eId value to look for.

    Returns:
        The first matching element in document order, or ``None`` when ``root``
        is ``None``, ``eid`` is empty, or nothing matches.
    """
    if root is None or not eid:
        return None
    # Pass the value as an XPath variable instead of formatting it into the
    # expression: an eId containing a quote would otherwise break the query
    # (or alter it), and eIds are produced by the LLM agent.
    found_elements = root.xpath(".//*[@eId=$eid]", eid=str(eid))
    return found_elements[0] if found_elements else None

# --- Tool Implementations ---
TOOL_MAPPING = {}

def _build_akn_skeleton(doc_type: str, year: str, number: str, title: str, country: str):
    """Build the empty <akomaNtoso> skeleton (meta + main containers) and return its root."""

    def akn(tag):
        return etree.QName(AKN_NAMESPACE, tag)

    akn_root = etree.Element(akn("akomaNtoso"), nsmap=NSMAP)
    doc_el = etree.SubElement(akn_root, akn(doc_type), eId=doc_type)
    doc_el.set("name", "indonesianAct")

    meta_el = etree.SubElement(doc_el, akn("meta"), eId="meta")
    identification_el = etree.SubElement(
        meta_el, akn("identification"), eId="meta__identification", source="#sourceSystem"
    )

    work_uri = f"/akn/{country}/{doc_type}/{year}/{number}"

    frbr_work_el = etree.SubElement(identification_el, akn("FRBRWork"), eId="meta__identification__work")
    etree.SubElement(frbr_work_el, akn("FRBRthis"), value=f"{work_uri}/main")
    etree.SubElement(frbr_work_el, akn("FRBRuri"), value=work_uri)
    # Placeholder date: only the year is known at initialization time.
    etree.SubElement(frbr_work_el, akn("FRBRdate"), date=f"{year}-01-01", name="creation")
    # "as" is a Python keyword, so it cannot be passed as a keyword argument;
    # an attrib dict is required to emit the real `as` attribute (not `as_`).
    etree.SubElement(frbr_work_el, akn("FRBRauthor"), attrib={"href": "#system", "as": "#author"})
    etree.SubElement(frbr_work_el, akn("FRBRcountry"), value=country)
    etree.SubElement(frbr_work_el, akn("FRBRname"), value=title)
    etree.SubElement(frbr_work_el, akn("FRBRnumber"), value=number)

    frbr_manifestation_el = etree.SubElement(
        identification_el, akn("FRBRManifestation"), eId="meta__identification__manifestation"
    )
    etree.SubElement(frbr_manifestation_el, akn("FRBRthis"), value=f"{work_uri}/id@main.xml")
    etree.SubElement(frbr_manifestation_el, akn("FRBRuri"), value=f"{work_uri}/id.xml")
    etree.SubElement(
        frbr_manifestation_el,
        akn("FRBRdate"),
        date=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        name="publication",
    )
    etree.SubElement(frbr_manifestation_el, akn("FRBRformat"), value="application/akn+xml")

    # Empty top-level containers; their eIds are what the agent addresses later.
    for container_name in ("preamble", "body", "conclusions", "attachments"):
        etree.SubElement(doc_el, akn(container_name), eId=container_name)

    return akn_root


async def initialize_akn_document(document_id: str, source_file: str, doc_type_hint: str = "act", initial_metadata: dict = None) -> str:
    """
    Initializes a new Akoma Ntoso document skeleton in MongoDB.

    Args:
        document_id: Unique identifier of the document record.
        source_file: Name of the source text file, stored with the record.
        doc_type_hint: Name of the AKN document element (also used as its eId
            and in the FRBR URIs). A null/empty value falls back to "act".
        initial_metadata: Optional dict with "title", "number", "year" and
            "country_code". Missing, null or empty values fall back to
            "Unknown Title", "0", "0000" and "id" respectively.

    Returns:
        A human-readable status string: a success message, or a message
        starting with "Error" when MongoDB is not connected, the document
        already exists, the skeleton cannot be built, or the insert fails.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."
    # The agent may send explicit nulls for optional arguments.
    initial_metadata = initial_metadata or {}
    doc_element_name = doc_type_hint or "act"

    existing_doc = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if existing_doc:
        return f"Error: Document {document_id} already exists. Cannot re-initialize."

    # `or` (rather than a .get default) also covers keys present with a null value,
    # which would otherwise reach lxml as None or show up as "None" in the URIs.
    year = str(initial_metadata.get("year") or "0000")
    number = str(initial_metadata.get("number") or "0")
    title = str(initial_metadata.get("title") or "Unknown Title")
    country = str(initial_metadata.get("country_code") or "id")

    try:
        akn_root = _build_akn_skeleton(doc_element_name, year, number, title, country)
    except (TypeError, ValueError) as e:
        # lxml rejects invalid tag names and attribute values with these exceptions.
        return f"Error initializing document '{document_id}': could not build AKN skeleton: {str(e)}"

    akn_xml_string = xml_to_string(akn_root)

    now = datetime.now(timezone.utc)
    doc_to_insert = {
        "document_id": document_id,
        "source_file": source_file,
        "document_metadata_extracted": initial_metadata,
        "akn_xml_string": akn_xml_string,
        "processing_state": {
            "current_chunk_id": None,
            "last_processed_akn_eId": doc_element_name,
            "status": "initialized"
        },
        "chunk_processing_log": [],
        "created_at": now,
        "updated_at": now
    }
    try:
        await mongo_manager.documents_collection.insert_one(doc_to_insert)
        return f"Successfully initialized Akoma Ntoso document '{document_id}' in database."
    except Exception as e:
        return f"Error initializing document '{document_id}': {str(e)}"
TOOL_MAPPING["initialize_akn_document"] = initialize_akn_document


def _clone_element(element):
    """Return a detached deep copy of ``element`` without its tail text."""
    # with_tail=False: the tail belongs to the parent's mixed content, and a
    # serialized "<x/>tail" fragment cannot be parsed back as a document.
    return etree.fromstring(etree.tostring(element, with_tail=False))


def _truncate_below_depth(element, levels_to_keep: int) -> None:
    """Keep ``levels_to_keep`` levels of children under ``element`` and drop anything deeper.

    Text that followed a dropped child is kept in the element's text, and an
    XML comment records how many child nodes were omitted.
    """
    if levels_to_keep <= 0:
        dropped = list(element)
        if not dropped:
            return
        kept_text = element.text or ""
        for child in dropped:
            # lxml removes a child's tail together with the child, so save it first.
            kept_text += child.tail or ""
            element.remove(child)
        element.text = kept_text or None
        element.append(etree.Comment(f" {len(dropped)} child node(s) omitted "))
        return
    for child in element:
        # Comments and processing instructions have a non-string tag; skip them.
        if isinstance(child.tag, str):
            _truncate_below_depth(child, levels_to_keep - 1)


async def get_akn_document_context(document_id: str, target_eId: Optional[str] = None,
                                 context_type: str = "element_with_children",
                                 max_depth: int = 2) -> str:
    """Return part of the stored AKN XML as a string for the agent to use as context.

    Args:
        document_id: Identifier of the stored document.
        target_eId: eId of the element to focus on. When ``None`` (or when
            ``context_type`` is "full") the whole document is returned, except
            that a document longer than 3000 characters requested without a
            target is reduced to its preamble, body and conclusions.
        context_type: One of "full", "element_only" (the element without its
            child elements), "element_with_children" (children down to
            ``max_depth`` levels) or "parent_and_siblings" (the element's
            parent with all of its children).
        max_depth: Number of child levels kept for "element_with_children";
            ``None`` means the default of 2 and negative values are treated as 0.

    Returns:
        The XML (targeted snippets are wrapped in ``<contextSnippet>``), or a
        message starting with "Error" when the document or element cannot be
        found, the stored XML cannot be parsed, or an argument is invalid.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found or has no AKN XML."

    akn_xml_string = doc_data["akn_xml_string"]
    root = parse_xml_string(akn_xml_string)
    if root is None:
        return "Error: Could not parse stored AKN XML."

    if context_type == "full" or target_eId is None:
        if len(akn_xml_string) > 3000 and target_eId is None:
            summary_xml_parts = []
            for container_eid in ("preamble", "body", "conclusions"):
                container = find_element_by_eid(root, container_eid)
                if container is not None:
                    summary_xml_parts.append(xml_to_string(container, pretty_print=False))
            return f"<akomaNtoso>...<summary>{''.join(summary_xml_parts)}</summary>...</akomaNtoso> Current document length: {len(akn_xml_string)} chars."
        return xml_to_string(root)

    element_to_return = find_element_by_eid(root, target_eId)
    if element_to_return is None:
        return f"Error: Element with eId '{target_eId}' not found in document '{document_id}'."

    if context_type == "element_only":
        snippet = _clone_element(element_to_return)
        _truncate_below_depth(snippet, 0)
    elif context_type == "element_with_children":
        try:
            levels = 2 if max_depth is None else max(int(max_depth), 0)
        except (TypeError, ValueError):
            return f"Error: max_depth must be an integer, got {max_depth!r}."
        snippet = _clone_element(element_to_return)
        _truncate_below_depth(snippet, levels)
    elif context_type == "parent_and_siblings":
        parent = element_to_return.getparent()
        if parent is None:
            return xml_to_string(element_to_return)
        # Cloning the parent keeps its attributes, its own text and every child.
        snippet = _clone_element(parent)
    else:
        return "Error: Invalid context_type specified."

    temp_root = etree.Element(etree.QName(AKN_NAMESPACE, "contextSnippet"), nsmap=NSMAP)
    temp_root.append(snippet)
    return xml_to_string(temp_root)
TOOL_MAPPING["get_akn_document_context"] = get_akn_document_context


async def add_akn_element(document_id: str, new_element_akn_xml: str,
                        parent_eId: Optional[str] = None,
                        sibling_eId: Optional[str] = None,
                        insert_position: str = "append_to_parent") -> str:
    """Insert a new element, given as an XML string, into the stored AKN document.

    Args:
        document_id: Identifier of the stored document.
        new_element_akn_xml: Well-formed XML for the single element to insert.
        parent_eId: eId of the parent for "append_to_parent". The literal
            'root' (as advertised in the tool schema) or the eId of the main
            document element both select the main document element.
        sibling_eId: eId of the reference sibling for "before_sibling" /
            "after_sibling".
        insert_position: "append_to_parent", "before_sibling" or "after_sibling".

    Returns:
        A success message naming the new element's eId, or a message starting
        with "Error" when something is missing, not found or cannot be parsed,
        or when the database update fails.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found."

    root = parse_xml_string(doc_data["akn_xml_string"])
    if root is None:
        return "Error: Could not parse stored AKN XML."

    try:
        new_element = parse_xml_string(new_element_akn_xml)
    except Exception as e:
        return f"Error parsing new_element_akn_xml: {str(e)}. Snippet: {str(new_element_akn_xml)[:100]}"
    if new_element is None:
        return f"Error: Could not parse new_element_akn_xml: {str(new_element_akn_xml)[:100]}"

    new_element_eId = new_element.get("eId", "unknownNewElement")

    if insert_position == "append_to_parent":
        if not parent_eId:
            return f"Error: parent_eId is required for 'append_to_parent'."

        # The first child of <akomaNtoso> is the main document element (e.g. <act>).
        doc_root = root[0] if len(root) else None
        doc_root_eid = doc_root.get("eId") if doc_root is not None else None

        if doc_root is not None and parent_eId in ("root", doc_root_eid):
            target_parent = doc_root
        else:
            target_parent = find_element_by_eid(root, parent_eId)

        if target_parent is None:
            return f"Error: Parent element with eId '{parent_eId}' not found."
        target_parent.append(new_element)

    elif insert_position in ["before_sibling", "after_sibling"]:
        if not sibling_eId:
            return f"Error: sibling_eId is required for '{insert_position}'."
        target_sibling = find_element_by_eid(root, sibling_eId)
        if target_sibling is None:
            return f"Error: Sibling element with eId '{sibling_eId}' not found."
        if target_sibling.getparent() is None:
            return f"Error: Sibling element '{sibling_eId}' has no parent (is root?)."

        if insert_position == "before_sibling":
            target_sibling.addprevious(new_element)
        else: # after_sibling
            target_sibling.addnext(new_element)
    else:
        return f"Error: Invalid insert_position '{insert_position}'."

    updated_xml_str = xml_to_string(root)
    try:
        await mongo_manager.documents_collection.update_one(
            {"document_id": document_id},
            {"$set": {"akn_xml_string": updated_xml_str, "updated_at": datetime.now(timezone.utc)}}
        )
        return f"Successfully added element '{new_element_eId}' to document '{document_id}'."
    except Exception as e:
        return f"Error updating document in DB after adding element: {str(e)}"
TOOL_MAPPING["add_akn_element"] = add_akn_element


async def update_akn_element(document_id: str, target_eId: str,
                             new_attributes: Optional[dict] = None,
                             new_text_content: Optional[str] = None,
                             append_text_content: Optional[str] = None) -> str:
    """Modify one element of a stored AKN document and write the document back.

    The stored XML string is loaded from MongoDB, the element is located via
    `find_element_by_eid`, the requested changes are applied in the order
    attributes -> replace text -> append text, and the re-serialized XML is
    saved together with a fresh `updated_at` timestamp.

    Args:
        document_id: Value of the `document_id` field of the stored document.
        target_eId: Identifier passed to `find_element_by_eid` to locate the element.
        new_attributes: Attribute name -> value pairs to set; values are
            converted with `str()`. Existing attributes not listed are kept.
        new_text_content: If not None, replaces the element's content: all
            child elements are removed and `.text` is set to this value.
        append_text_content: If non-empty, text added at the END of the
            element's content.

    Returns:
        A human-readable status string; failures are reported as strings
        starting with "Error" rather than raised.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found."

    root = parse_xml_string(doc_data["akn_xml_string"])
    if root is None:
        return "Error: Could not parse stored AKN XML."

    target_element = find_element_by_eid(root, target_eId)
    if target_element is None:
        return f"Error: Element with eId '{target_eId}' not found."

    if new_attributes:
        for key, value in new_attributes.items():
            target_element.set(key, str(value))

    if new_text_content is not None:
        # Replace only what is INSIDE the element. `.tail` is deliberately left
        # alone: it holds the text that follows this element within its parent
        # (mixed content), so clearing it would delete text the caller never
        # asked to touch.
        for child in list(target_element):
            target_element.remove(child)
        target_element.text = new_text_content

    if append_text_content:
        if len(target_element):
            # With child elements present, the end of the content is the tail of
            # the last child; appending to `.text` would insert the text BEFORE
            # the children.
            last_child = target_element[-1]
            last_child.tail = (last_child.tail or "") + append_text_content
        else:
            target_element.text = (target_element.text or "") + append_text_content

    updated_xml_str = xml_to_string(root)
    try:
        await mongo_manager.documents_collection.update_one(
            {"document_id": document_id},
            {"$set": {"akn_xml_string": updated_xml_str, "updated_at": datetime.now(timezone.utc)}}
        )
        return f"Successfully updated element '{target_eId}' in document '{document_id}'."
    except Exception as e:
        return f"Error updating document in DB after updating element: {str(e)}"
TOOL_MAPPING["update_akn_element"] = update_akn_element


async def store_chunk_processing_info(document_id: str, chunk_id: str,
                                    analysis_summary: str, last_processed_akn_eId: str,
                                    errors_or_notes: Optional[str] = None) -> str:
    """Record the outcome of one processed chunk on the stored document.

    Pushes an entry onto the document's `chunk_processing_log` array and
    updates the `processing_state.*` fields plus `updated_at` in a single
    `update_one` call.

    Returns:
        A status string. Failures (no collection, no matching document, or a
        database exception) are reported as strings starting with "Error".
    """
    collection = mongo_manager.documents_collection
    if collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    # Entry appended to the per-document processing history.
    chunk_record = {
        "chunk_id": chunk_id,
        "analysis_summary": analysis_summary,
        "last_processed_akn_eId_at_end_of_chunk": last_processed_akn_eId,
        "errors_or_notes": errors_or_notes,
        "timestamp": datetime.now(timezone.utc),
    }

    # Fields overwritten so the next chunk can pick up where this one ended.
    state_fields = {
        "processing_state.current_chunk_id": chunk_id,
        "processing_state.last_processed_akn_eId": last_processed_akn_eId,
        "processing_state.status": "processing",
        "updated_at": datetime.now(timezone.utc),
    }

    try:
        outcome = await collection.update_one(
            {"document_id": document_id},
            {"$push": {"chunk_processing_log": chunk_record}, "$set": state_fields},
        )
        if outcome.matched_count != 0:
            return f"Successfully stored processing info for chunk '{chunk_id}' of document '{document_id}'."
        return f"Error: Document '{document_id}' not found for storing chunk info."
    except Exception as exc:
        return f"Error storing chunk processing info: {str(exc)}"
TOOL_MAPPING["store_chunk_processing_info"] = store_chunk_processing_info

print(f"Defined {len(TOOL_MAPPING)} tool implementations with corrected MongoDB checks.")

Defined 5 tool implementations with corrected MongoDB checks.


In [7]:
# System prompt for the Akoma Ntoso Document Building Agent

# We'll insert the AKN_STRUCTURE_MAPPING_INFO directly into the prompt.
# Ensure AKN_STRUCTURE_MAPPING_INFO is defined from Cell 2.

# System prompt sent with every chunk. It is an f-string: AKN_STRUCTURE_MAPPING_INFO
# and AKN_NAMESPACE are interpolated when this cell runs, so both must already be defined.
agent_system_prompt = f"""
You are an expert legislative informatics assistant specializing in converting Indonesian legal documents into Akoma Ntoso (AKN) v3.0 XML format.
Your primary goal is to analyze chunks of an Indonesian legal text and incrementally construct a well-formed and semantically rich Akoma Ntoso XML document.
You MUST strictly adhere to Akoma Ntoso structure and use the provided mappings and eId strategy.
The document will be built incrementally. Always consider the current state of the AKN XML document (obtained via tools) and the context of the last processed element.

**Core Task:**
For each chunk of text provided:
1.  Analyze the text to identify its structural and semantic meaning according to Indonesian legislative drafting conventions.
2.  Determine how this text fits into the existing Akoma Ntoso XML document structure. It might be:
    * The start of a new major section (e.g., preamble, body, conclusions, attachment).
    * A new structural element (e.g., a new Pasal/article, Ayat/paragraph, BAB/chapter, Menimbang/recital).
    * Content continuing an existing element (e.g., text of a `<p>` tag that spanned across chunks).
    * Metadata to be added or updated.
3.  Construct the appropriate Akoma Ntoso XML snippet(s) for the identified text.
4.  Use the available tools to:
    * Fetch context from the current AKN document (`get_akn_document_context`).
    * Add new elements (`add_akn_element`).
    * Update existing elements or their content (`update_akn_element`).
    * Initialize the document if it's the very first chunk (`initialize_akn_document`).
5.  After processing a chunk, record what you did using `store_chunk_processing_info`, specifying the `last_processed_akn_eId`.

**Akoma Ntoso Mapping and `eId` Strategy Details:**
{AKN_STRUCTURE_MAPPING_INFO}

**Workflow Example (Conceptual):**
* **User provides Chunk X.**
* **You (Agent):**
    1.  "Okay, I'm analyzing Chunk X. The previous chunk ended at eId `body__art_1__para_2`."
    2.  "I'll call `get_akn_document_context` for `body__art_1__para_2` or its parent `body__art_1` to confirm the structure."
    3.  (Tool returns context XML)
    4.  "The text in Chunk X starts with '(3) Ini adalah ayat ketiga...'. This is a new Ayat for Pasal 1."
    5.  "I will construct the AKN for this Ayat: `<paragraph eId=\"body__art_1__para_3\" num=\"(3)\"><num>(3)</num><content><p>Ini adalah ayat ketiga...</p></content></paragraph>`."
    6.  "I will call `add_akn_element` with `parent_eId=\"body__art_1__content\"` (assuming content wrapper) and the new paragraph XML."
    7.  (Tool confirms success)
    8.  "The main element processed in this chunk is `body__art_1__para_3`."
    9.  "I will call `store_chunk_processing_info` with `last_processed_akn_eId=\"body__art_1__para_3\"`."

**Important Rules for Interaction:**
* **Validity:** Generated XML snippets MUST be well-formed.
* **`eId`s:** Crucial. Generate them carefully, hierarchically, and ensure they are unique for each new element. Refer to the `eId` strategy.
* **Namespaces:** All AKN elements must be in the namespace `{AKN_NAMESPACE}`. Your tool interactions will handle this if you provide unprefixed tags in your XML strings for elements (e.g. `<paragraph>...</paragraph>`).
* **Context is Key:** Before adding or updating, understand where you are in the document by using `get_akn_document_context`.
* **Text Content:** Place general text content within `<p>` tags, typically nested inside a `<content>` element of a structural block like `<paragraph>` or `<article>`.
* **Continuations:** If text in the current chunk continues an element from a previous chunk (e.g., a long paragraph), use `update_akn_element` with `append_text_content` on the relevant `<p>` element's `eId`.
* **Logging:** Clearly state your actions and the `eId`s you are working with in your thought process and when calling `store_chunk_processing_info`.
* **Conservatism:** If a structure is highly ambiguous, make a reasonable choice based on AKN principles, and if possible, add an XML comment such as `<!-- AGENT NOTE: why this structure was chosen -->` within the XML you generate, or note it in `store_chunk_processing_info`.

Your response should primarily be a sequence of tool calls to build the document. Explain your reasoning briefly before each tool call if necessary.
The final Akoma Ntoso document will be assembled from your incremental modifications.
"""

print("Agent system prompt defined.")
# For debugging or inspection:
# print(agent_system_prompt)

Agent system prompt defined.


In [8]:
def simple_chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> List[Dict[str, Any]]:
    """Split `text` into fixed-size character windows that overlap.

    Consecutive chunks start `chunk_size - overlap` characters apart, so each
    chunk repeats the last `overlap` characters of the previous one. Chunking
    stops with the first chunk that reaches the end of the text.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy `0 <= overlap < chunk_size` so every step moves forward.

    Returns:
        A list of dicts with keys `chunk_id` ("chunk_001", "chunk_002", ...),
        `start_pos`, `end_pos` (exclusive, clamped to `len(text)`), `content`
        and `char_count`.

    Raises:
        ValueError: If `chunk_size` is not positive or `overlap` is outside
            `[0, chunk_size)`. Such values cannot produce a forward-moving,
            gap-free chunking.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    # Validated above to be >= 1, which guarantees the loop terminates; no
    # iteration cap is needed (a cap would silently truncate long documents).
    stride = chunk_size - overlap
    text_length = len(text)

    chunks_data: List[Dict[str, Any]] = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunk_text = text[start:end]
        chunks_data.append({
            "chunk_id": f"chunk_{len(chunks_data) + 1:03d}",  # 1-based, zero-padded
            "start_pos": start,
            "end_pos": min(end, text_length),
            "content": chunk_text,
            "char_count": len(chunk_text)
        })

        if end >= text_length:
            # This chunk already covers the tail of the text.
            break
        start += stride

    return chunks_data

print("Chunking function defined.")

Chunking function defined.


In [9]:
async def run_agentic_loop(
    system_prompt_content: str,
    user_prompt_content: str,
    tools_list: list,
    tool_function_mapping: dict,
    llm_model_name: str,
    max_iterations: int = 20
):
    """Drive one chat-completion conversation until the model stops calling tools.

    Each round sends the running message list to the module-level `client`,
    appends the assistant reply, executes every requested tool call through
    `tool_function_mapping`, and feeds the results back as `tool` messages.

    Args:
        system_prompt_content: Text of the initial system message.
        user_prompt_content: Text of the initial user message.
        tools_list: Tool schemas forwarded as the `tools` argument.
        tool_function_mapping: Tool name -> async callable, invoked with the
            decoded JSON arguments as keyword arguments.
        llm_model_name: Model name forwarded as the `model` argument.
        max_iterations: Upper bound on completion rounds for this prompt.

    Returns:
        The assistant's `content` from the first reply without tool calls, or
        a fixed notice string when `max_iterations` rounds all ended in tool calls.
    """
    conversation = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": user_prompt_content},
    ]
    print(f"----\nSYSTEM: {system_prompt_content[:200]}...\nUSER: {user_prompt_content[:300]}...\n----")

    for round_number in range(1, max_iterations + 1):
        print(f"\nIteration {round_number}")
        completion = await client.chat.completions.create(
            model=llm_model_name,
            messages=conversation,
            tools=tools_list,
            tool_choice="auto",
        )

        reply = completion.choices[0].message
        # The assistant turn is stored before any tool results that answer it.
        conversation.append(reply.model_dump(exclude_none=True))

        if not reply.tool_calls:
            # A plain text reply ends the loop.
            final_text = reply.content
            print(f"ASSISTANT (Final Response): {final_text}")
            return final_text

        print(f"ASSISTANT (Tool Calls):")
        for call in reply.tool_calls:
            name = call.function.name
            raw_arguments = call.function.arguments
            print(f"  - Calling tool: {name}")
            print(f"    Arguments: {raw_arguments}")

            if name in tool_function_mapping:
                try:
                    keyword_arguments = json.loads(raw_arguments)
                    handler = tool_function_mapping[name]
                    outcome = await handler(**keyword_arguments)
                except json.JSONDecodeError:
                    outcome = f"Error: Invalid JSON in arguments for {name}: {raw_arguments}"
                    print(f"    Error: Invalid JSON arguments for {name}.")
                except Exception as exc:
                    # Failures are reported back to the model instead of aborting the loop.
                    outcome = f"Error executing tool {name}: {str(exc)}"
                    print(f"    Error executing tool {name}: {exc}")
            else:
                outcome = f"Error: Tool '{name}' not found."
                print(f"    Error: Tool '{name}' not found.")

            outcome_text = str(outcome)  # tool message content is sent as a string
            print(f"    Tool Result ({name}): {outcome_text[:300]}...")
            conversation.append({
                "role": "tool",
                "tool_call_id": call.id,
                "name": name,
                "content": outcome_text,
            })

    return "Max iterations reached. Agent may not have finished."


print("Agent loop function defined.")

Agent loop function defined.


In [10]:
# --- Main Workflow Orchestration ---

async def process_document(
    document_text_content: str,
    doc_id: str,
    doc_source_filename: str,
    initial_doc_metadata: dict, # e.g., {"title": "...", "number": "...", "year": "...", "doc_subtype": "..."}
    chunk_s: int = 2500, # Adjusted chunk size, AKN context can be verbose
    chunk_o: int = 300   # Adjusted overlap
):
    """Convert one raw legal text into an AKN XML document, chunk by chunk.

    The text is split with `simple_chunk_text`; for every chunk the agent loop
    is run with a prompt that names the document, the chunk and the eId the
    previous chunk ended on. The agent's tools persist their changes in
    MongoDB, so after each chunk the `processing_state` is re-read from the
    database to seed the next prompt. Finally the stored XML is printed
    (first 1000 chars), written to `<doc_id>_final.akn.xml` in the current
    working directory and returned.

    Args:
        document_text_content: Full raw text of the document.
        doc_id: Value used for the `document_id` field in MongoDB.
        doc_source_filename: Name of the source file; handed to the agent with
            the first chunk.
        initial_doc_metadata: Known metadata; handed to the agent with the
            first chunk so it can be used when initializing the document.
        chunk_s: Chunk size in characters.
        chunk_o: Overlap between consecutive chunks in characters.

    Returns:
        The final AKN XML string, or None when the database connection fails
        or no XML is stored for `doc_id` after processing.

    Note:
        The MongoDB connection is intentionally left open; the caller owns its
        lifecycle (useful when several documents are processed in one run).
    """
    print(f"Starting processing for document: {doc_id}")

    # 1. Connect to MongoDB
    # NOTE(review): connect() is defined elsewhere; another caller in this file
    # treats a falsy return as failure, so both None and False are rejected
    # here. Identity checks are used because Motor database objects do not
    # support truth-value testing.
    connection = await mongo_manager.connect()
    if connection is None or connection is False:
        print("Failed to connect to MongoDB. Aborting.")
        return None

    # 2. Chunk the document
    text_chunks = simple_chunk_text(document_text_content, chunk_size=chunk_s, overlap=chunk_o)
    print(f"Document divided into {len(text_chunks)} chunks.")

    # 3. Initialization of the AKN document is delegated to the agent, which is
    #    instructed to call `initialize_akn_document` on the first chunk.

    # 4. Process each chunk with the agent
    current_processing_state = {"last_processed_akn_eId": "act"} # Initial context for the first chunk

    for i, chunk_data in enumerate(text_chunks):
        print(f"\n--- Processing Chunk {i+1}/{len(text_chunks)} (ID: {chunk_data['chunk_id']}) ---")
        print(f"Content (first 100 chars): {chunk_data['content'][:100]}...")

        # Only the first chunk carries the caller-supplied metadata: that is
        # when the agent has to initialize the document. For later chunks the
        # section is empty and the prompt text is unchanged.
        bootstrap_section = ""
        if i == 0:
            metadata_json = json.dumps(initial_doc_metadata, ensure_ascii=False, default=str)
            bootstrap_section = (
                "Known document metadata (use it when calling `initialize_akn_document`):\n"
                f"- Source filename: {doc_source_filename}\n"
                f"- Metadata: {metadata_json}\n"
            )

        # Construct user prompt for the agent
        user_prompt = f"""
Analyze the following chunk of an Indonesian legal document (ID: {chunk_data['chunk_id']}).
The document ID is '{doc_id}'.
The previously processed Akoma Ntoso element eId was: '{current_processing_state.get("last_processed_akn_eId", "act")}'.

Your task is to integrate this chunk's content into the Akoma Ntoso XML document.
Use tools to fetch context, add/update elements, and log your progress.
If this is the very first chunk (chunk_001), you MUST call `initialize_akn_document` first.
{bootstrap_section}
Chunk Content:
---
{chunk_data['content']}
---
Remember to call `store_chunk_processing_info` at the end of your operations for this chunk.
Determine the correct AKN structure and eIds based on the system guidelines and the content.
"""

        # Run the agent loop for this chunk. Its tools write to the database;
        # the return value is only used for logging here.
        agent_response_or_status = await run_agentic_loop(
            system_prompt_content=agent_system_prompt,
            user_prompt_content=user_prompt,
            tools_list=tools_schemas, # Defined in Cell 5
            tool_function_mapping=TOOL_MAPPING, # Populated in Cell 6
            llm_model_name=LLM_MODEL # Defined in Cell 3
        )
        print(f"Agent interaction for chunk {chunk_data['chunk_id']} ended. Status/Response: {str(agent_response_or_status)[:200]}...")

        # Re-read the processing state that `store_chunk_processing_info` is
        # expected to have written, so the next prompt starts from it.
        doc_db_state = await mongo_manager.documents_collection.find_one(
            {"document_id": doc_id},
            {"processing_state": 1}
        )
        if doc_db_state and "processing_state" in doc_db_state:
            current_processing_state = doc_db_state["processing_state"]
            print(f"Updated processing state from DB: Last eId '{current_processing_state.get('last_processed_akn_eId')}'")
        else:
            # The previous state is kept, so the next chunk reuses the last known eId.
            print(f"Warning: Could not retrieve updated processing state for {doc_id} after chunk {chunk_data['chunk_id']}.")

    print(f"\n--- Document processing for {doc_id} complete. ---")

    # 5. Retrieve the final Akoma Ntoso XML
    final_doc_data = await mongo_manager.documents_collection.find_one({"document_id": doc_id})
    if final_doc_data and "akn_xml_string" in final_doc_data:
        print("\nFinal Akoma Ntoso XML (first 1000 chars):")
        print(final_doc_data["akn_xml_string"][:1000])
        # Save to file
        output_filename = f"{doc_id}_final.akn.xml"
        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(final_doc_data["akn_xml_string"])
        print(f"Full Akoma Ntoso XML saved to: {output_filename}")
        return final_doc_data["akn_xml_string"]
    else:
        print(f"Error: Could not retrieve final Akoma Ntoso XML for {doc_id}.")
        return None

print("Main workflow orchestration function defined.")

Main workflow orchestration function defined.


In [11]:
# --- Run configuration for the example document ---

# Path to the raw text file, resolved relative to the notebook's working directory.
# The value below points two directories up, into a 'raw/' folder.
# Alternative layout example: notebook in 'notebooks/' and text file in 'raw_data/':
# sample_doc_path = "../raw_data/UU_8_1961.txt"
# Make sure the file is accessible from your Jupyter environment.
sample_doc_path = "../../raw/UU_8_1961.txt" # <--- ADJUST THIS PATH TO YOUR FILE

document_id_main = "UU_8_1961_vAkn" # Unique ID for this processing run / document version

# Basic metadata for the document being processed. Ideally the agent could also
# extract some of this from the first chunk, but providing it up front helps
# bootstrap the FRBR identification. Values are taken from the raw text.
initial_metadata_main = {
    "title": "WAJIB KERJA SARJANA",
    "number": "8",
    "year": "1961",
    "doc_subtype": "UNDANG-UNDANG", # This could be used for docType in AKN
    "country_code": "id"
    # Add other known metadata if available, like enacting authority placeholder
}

async def main():
    """Load the sample document, reset its database entry and run the conversion.

    Steps: read `sample_doc_path`, make sure MongoDB is connected, delete any
    stored document with id `document_id_main` (fresh-run behaviour, intended
    for testing), run `process_document`, and close the MongoDB connection.
    The connection is closed even when deletion or processing raises.

    Returns:
        None. Progress and the final outcome are reported via `print`.
    """
    # Read the document content
    try:
        with open(sample_doc_path, 'r', encoding='utf-8') as f:
            raw_document_text = f.read()
        print(f"Successfully loaded document: {sample_doc_path}, length: {len(raw_document_text)} chars.")
    except FileNotFoundError:
        print(f"Error: Document file not found at {sample_doc_path}. Please check the path.")
        return

    # Ensure MongoDB is connected before starting processing
    if mongo_manager.db is None: # Check if connection was established
        is_connected = await mongo_manager.connect()
        if not is_connected:
            print("MongoDB connection failed. Cannot proceed.")
            return

    try:
        # Clean up existing document entry for a fresh run (optional, for testing)
        # BE CAREFUL WITH THIS IN A PRODUCTION-LIKE SCENARIO
        delete_result = await mongo_manager.documents_collection.delete_one({"document_id": document_id_main})
        if delete_result.deleted_count > 0:
            print(f"Cleaned up existing entry for {document_id_main}.")

        # Start processing
        final_akn_xml = await process_document(
            document_text_content=raw_document_text,
            doc_id=document_id_main,
            doc_source_filename=os.path.basename(sample_doc_path),
            initial_doc_metadata=initial_metadata_main,
            chunk_s=2500, # Adjust as needed; smaller for more granular agent steps initially
            chunk_o=400   # Overlap
        )
    finally:
        # Release the connection whether processing succeeded or raised, so a
        # failed run does not leave it open in the kernel.
        await mongo_manager.close()

    if final_akn_xml:
        print(f"\nProcessing complete for {document_id_main}.")
    else:
        print(f"\nProcessing for {document_id_main} encountered issues or did not produce XML.")

# How to run the async `main` coroutine:
if __name__ == '__main__':
    # This guard is the usual entry point for scripts; it is intentionally a
    # no-op here so that running this cell only defines `main`.
    # In Jupyter, run the coroutine from a cell with top-level await:
    #     await main()
    # In a plain script, start an event loop instead:
    #     import asyncio
    #     asyncio.run(main())
    pass

print("To run the processing, execute 'await main()' in a new cell.")

To run the processing, execute 'await main()' in a new cell.


In [12]:
await main()

Successfully loaded document: ../../raw/UU_8_1961.txt, length: 11846 chars.
Attempting to connect to MongoDB: hukum_terbuka_akn...
✅ Connected to MongoDB: hukum_terbuka_akn, collection: legal_documents_akn
Cleaned up existing entry for UU_8_1961_vAkn.
Starting processing for document: UU_8_1961_vAkn
✅ MongoDB connection already active for: hukum_terbuka_akn
Document divided into 6 chunks.

--- Processing Chunk 1/6 (ID: chunk_001) ---
Content (first 100 chars): PRESIDEN
REPUBLIK INDONESIA
UNDANG-UNDANG REPUBLIK INDONESIA
NOMOR 8 TAHUN 1961
TENTANG
WAJIB KERJA ...
----
SYSTEM: 
You are an expert legislative informatics assistant specializing in converting Indonesian legal documents into Akoma Ntoso (AKN) v3.0 XML format.
Your primary goal is to analyze chunks of an Indonesi...
USER: 
Analyze the following chunk of an Indonesian legal document (ID: chunk_001).
The document ID is 'UU_8_1961_vAkn'.
The previously processed Akoma Ntoso element eId was: 'act'.

Your task is to integrate this 