In [66]:
# Basic imports (standard library)
import copy
import json
import os
from datetime import datetime, timezone
import uuid
from typing import List, Dict, Any, Optional

# OpenAI client (or your preferred LLM client)
from openai import AsyncOpenAI

# Weave for tracing (optional, if you're using it)
import weave

# MongoDB
import motor
from motor.motor_asyncio import AsyncIOMotorClient
import motor.motor_asyncio # Make sure this import is at the top of your file
import pymongo # For ObjectId, and some potential sync operations or type hints

# XML Processing
from lxml import etree # Preferred for robust XML/XPath/Namespace handling

# Pydantic for structuring tool inputs and some internal models
from pydantic import BaseModel, Field

print("Imports successful.")

Imports successful.


In [67]:
# Initialize Weave tracing with the 'HukumTerbuka' argument.
# The call is the cell's last expression, so the object it returns is echoed as the cell output.
weave.init('HukumTerbuka')

<weave.trace.weave_client.WeaveClient at 0x7fd00d9e8440>

In [68]:
# Akoma Ntoso v3.0 Namespace
# AKN_NAMESPACE is used by the XML helpers to build qualified tag names;
# NSMAP maps the *default* (prefix-less) namespace to it, so serialized elements
# carry xmlns="..." instead of a prefix.
AKN_NAMESPACE = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"
NSMAP = {None: AKN_NAMESPACE} # Default namespace for lxml

# --- Indonesian Legal Structure to AKN Mapping (Revised based on UU No.8/1961 example) ---
# Free-text mapping guide kept as a single string. It is runtime text (presumably
# injected into the agent prompt — TODO confirm against the cell that builds the prompt),
# so its wording must not be edited casually.
# NOTE(review): the eId examples in this guide start at `body__...` / `preamble__...`,
# whereas the skeleton created by initialize_akn_document prefixes eIds with the
# upper-cased document element name (e.g. `ACT__body`). Confirm which convention the
# agent is expected to follow.
AKN_STRUCTURE_MAPPING_INFO = """
Akoma Ntoso (AKN) v3.0 Mapping for Indonesian Legal Documents:

**A. General Principles**:
1.  **Document Root**: For Undang-Undang (UU), use `<akomaNtoso><act name="indonesianAct" ...>`.
2.  **`eId` (Element ID) Strategy**:
    * Crucial for all addressable structural elements.
    * Hierarchical and predictable: `containerId__elementTypePrefix_number`.
    * Common prefixes:
        * `meta`, `identification`, `publication`
        * `preamble`, `rec` (recital), `cit` (citation), `formula`
        * `body`
        * `ch` (chapter), `sec` (section - for Bagian), `subsec` (subsection - for Paragraf structure)
        * `art` (article - for Pasal), `para` (paragraph - for Ayat)
        * `list` (for a list within an Ayat/paragraph), `point` (for an item 'a.', 'b.' in a list)
        * `conclusions`, `docGeneration`, `signatureBlock`, `signature`
        * `attachment`, `elucidation`, `elucidationGeneral`, `elucidationArticle`
    * Example: `body__art_1` (Pasal 1), `body__art_1__para_1` (Ayat 1 of Pasal 1), `body__art_1__para_1__list_1__point_a` (Point 'a' in Ayat 1 of Pasal 1).
    * Generate eIds sequentially within their parent. `art_1`, `art_2`, etc. `art_1__para_1`, `art_1__para_2`, etc.

**B. Document Structure Mapping**:

1.  **Metadata (`<meta>`)**:
    * `identification source="#source_for_main_doc"`: Contains FRBR elements.
        * `FRBRWork`: `FRBRthis value="/akn/id/act/YYYY/NN/main"`, `FRBRuri value="/akn/id/act/YYYY/NN"`, `FRBRdate date="YYYY-MM-DD" name="enactment"`, `FRBRauthor href="#author_ref"`, `FRBRcountry value="id"`, `FRBRname value="FULL_TITLE_TENTANG"`, `FRBRnumber value="NN"`, `FRBRprescriptive value="true"`.
        * `FRBRManifestation`: `FRBRthis value="/akn/id/act/YYYY/NN/id@YYYY-MM-DD/main.xml"`, `FRBRuri value="/akn/id/act/YYYY/NN/id@YYYY-MM-DD.xml"`, `FRBRdate date="YYYY-MM-DD" name="publication"`, `FRBRformat value="application/akn+xml"`.
    * `publication name="LembaranNegara" showAs="Lembaran Negara Republik Indonesia" date="YYYY-MM-DD" number="LN_NUMBER" TLA="LNRI" eId="meta__ln"`
    * `lifecycle source="#source_for_main_doc"`: Events like enactment.
    * `references source="#source_for_main_doc"`: Define authorities like `#presiden_ri`, `#dpr_gr`.

2.  **Preamble (`<preamble eId="preamble">`)**:
    * Opening line like "PRESIDEN REPUBLIK INDONESIA,": `<docTitle eId="preamble__doctitle_1">PRESIDEN REPUBLIK INDONESIA,</docTitle>` or similar.
    * `Menimbang:`: `<recitals eId="preamble__recitals_1">`. Each point `a. ...;` becomes a `<recital eId="preamble__recitals_1__rec_a"><num>a.</num><p>bahwa ...;</p></recital>`.
    * `Mengingat:`: `<citations eId="preamble__citations_1">`. Each point `a. ...;` becomes a `<citation eId="preamble__citations_1__cit_a"><num>a.</num><p>Pasal ...;</p></citation>`.
    * `Dengan persetujuan...;`: `<container name="approvalStatement" eId="preamble__approval_1"><p>Dengan persetujuan Dewan Perwakilan Rakyat Gotong Royong;</p></container>`
    * `MEMUTUSKAN :`: `<formula name="decision" eId="preamble__formula_decision"> <p>MEMUTUSKAN :</p> </formula>` (This formula might then contain the enacting clauses).
    * `I. Mencabut...;`: Within or after the decision formula, could be `<container name="revocation" eId="preamble__revocation_1"><num>I.</num><p>Mencabut: Undang-undang ... Lembaran-Negara ...);</p></container>`
    * `II. Menetapkan: UNDANG-UNDANG TENTANG ... .`: `<enactingFormula eId="preamble__enactingformula_1" name="enactingFormula"><num>II.</num><p>Menetapkan: <docType refersTo="#indonesianAct">UNDANG-UNDANG</docType> TENTANG <docTitle refersTo="#mainTitle">WAJIB KERJA SARJANA</docTitle>.</p></enactingFormula>`

3.  **Body (`<body eId="body">`)**:
    * `BAB X JUDUL BAB`: If present, `<chapter eId="body__ch_X" num="X"><num>BAB X</num><heading>JUDUL BAB</heading> ... </chapter>`.
    * `Bagian Kesatu JUDUL BAGIAN`: If present, `<section eId="body__ch_X__sec_Y" num="Y"><num>Bagian Kesatu</num><heading>JUDUL BAGIAN</heading> ... </section>`.
    * `Pasal N`: `<article eId="body__art_N" num="N"><num>Pasal N.</num><content> ... </content></article>`. If a Pasal has a title, use `<heading>Judul Pasal</heading>` after `<num>`.
    * `Ayat (M)`: Within `<article><content>`, use `<paragraph eId="body__art_N__para_M" num="(M)"><num>(M)</num><content><p>Teks ayat...</p></content></paragraph>`.
    * Lists within Ayat (e.g., `a. ...; b. ...;`): `<list eId="body__art_N__para_M__list_1" type="alpha"><point eId="body__art_N__para_M__list_1__point_a"><num>a.</num><content><p>Teks poin a...</p></content></point> <point eId="body__art_N__para_M__list_1__point_b"><num>b.</num><content><p>Teks poin b...</p></content></point></list>`.
    * Concluding sentences of an Ayat after a list are part of the last `<p>` of the list's final point, or a new `<p>` within the Ayat's `<content>` after the `<list>`.

4.  **Concluding Provisions (Usually last Articles in `<body>` or in `<conclusions>`)**:
    * `Pasal X` (Pelaksanaan): `<article eId="body__art_X" num="X" heading="Pelaksanaan"><num>Pasal X.</num><content><p>Pelaksanaan...</p></content></article>`. Could have a `role` attribute.
    * `Pasal Y` (Mulai Berlaku): `<article eId="body__art_Y" num="Y" heading="Ketentuan Peralihan"><num>Pasal Y.</num><content><p>Undang-undang ini mulai berlaku...</p></content></article>`.
    * Promulgation command: `<container name="promulgationCommand" eId="body__promulgation_1"><p>Agar supaya setiap orang dapat mengetahuinya...</p></container>`.

5.  **Signatures & Dates (`<conclusions eId="conclusions">`)**:
    * `<container name="placeDateOfSignature" eId="conclusions__sigblock_1__placedate_1"><p>Disahkan di Jakarta</p><p>pada tanggal 29 April 1961.</p></container>`
    * `<container name="signatureBlock" eId="conclusions__sigblock_1"> <signature eId="conclusions__sigblock_1__signature_1"><role refersTo="#presidenRI">Pejabat Presiden Republik Indonesia,</role><person refersTo="#djuanda">DJUANDA</person></signature> </container>`
    * Similar structure for "Diundangkan di..." and the Sekretaris Negara.

6.  **`PENJELASAN` (Elucidation)**:
    * Attached to the main document: `<attachments eId="attachments_1"> <attachment eId="attachments_1__elucidation_1" type="elucidation"> ... </attachment> </attachments>`.
    * The elucidation itself will have `meta`, `preface` (containing its title), and `body`.
    * `PENJELASAN UMUM.`: `<container name="generalExplanation" eId="attachments_1__elucidation_1__body__general_1"> <heading>PENJELASAN UMUM.</heading> <p>Teks...</p> ... </container>`.
    * `PENJELASAN PASAL DEMI PASAL.`: `<container name="articleByArticleExplanation" eId="attachments_1__elucidation_1__body__artbyart_1"> <heading>PENJELASAN PASAL DEMI PASAL.</heading> ... </container>`.
        * `Pasal 1.`: `<clause eId="attachments_1__elucidation_1__body__artbyart_1__art_1" refersTo="#body__art_1"> <heading>Pasal 1.</heading> <p>Teks penjelasan pasal 1...</p> </clause>`.
        * `Pasal 2 dan 3.`: `<clause eId="attachments_1__elucidation_1__body__artbyart_1__art_2-3" refersTo="#body__art_2 #body__art_3"> <heading>Pasal 2 dan 3.</heading> <p>Teks penjelasan...</p> </clause>`.

**C. Agent Instructions**:
* You MUST generate valid XML.
* For each new element, you MUST generate a unique and hierarchical `eId`.
* When text seems to continue from a previous chunk into an existing element (like a `<p>`), use the `update_akn_element` tool with `append_text_content`.
* When adding a new structural element (like a new `<paragraph>` (Ayat) or `<article>` (Pasal)), use the `add_akn_element` tool.
* Always refer to the existing document structure using `get_akn_document_context` to ensure correct placement and `eId` generation.
* State the `eId` of the element you are primarily working on or have just completed for context logging.
"""

print("Revised Akoma Ntoso configuration loaded.")
# For debugging or inspection:
# print(AKN_STRUCTURE_MAPPING_INFO)

Revised Akoma Ntoso configuration loaded.


In [69]:
# The API key is read from the environment and never written into the notebook.
# Default backend: OpenRouter (OPENROUTER_API_KEY). To talk to OpenAI directly instead:
#   client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
client = AsyncOpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

# Selected model identifier. Other candidates noted by the author:
#   "openai/gpt-4o", "openai/gpt-4-turbo", "anthropic/claude-3.5-sonnet"
LLM_MODEL = "google/gemini-2.5-flash-preview-05-20"

print("Using LLM Model: " + LLM_MODEL)

Using LLM Model: google/gemini-2.5-flash-preview-05-20


In [70]:
class MongoDBManager:
    """Owns the Motor client plus the database and collection handles for AKN documents.

    ``async_client``, ``db`` and ``documents_collection`` are all ``None`` until
    :meth:`connect` succeeds, and are reset to ``None`` by :meth:`close` or by a
    failed connection attempt. Handles are always compared with ``is not None``
    rather than truth-tested, so the class does not depend on driver objects
    supporting ``bool()``.
    """

    def __init__(self, connection_string: str = "mongodb://localhost:27017/", db_name: str = "hukum_terbuka_akn"):
        """Store connection settings; no I/O happens until :meth:`connect` is awaited."""
        self.connection_string = connection_string
        self.db_name = db_name
        self.async_client = None
        self.db = None
        self.documents_collection = None

    async def connect(self) -> bool:
        """Connect to MongoDB and initialize db and collection attributes.

        Returns:
            True when a live connection is available (either the existing one
            answered a ping, or a new one was established); False when the
            connection attempt failed, in which case all handles are ``None``.
        """
        if self.async_client is not None:
            if self.db is not None and self.documents_collection is not None:
                try:
                    await self.async_client.admin.command('ping')
                    print(f"✅ MongoDB connection already active for: {self.db_name}")
                    return True
                except Exception as e: # Catch specific exceptions if possible
                    print(f"Stale connection detected (ping failed: {e}), attempting to reconnect.")
            # Either the ping failed or only some handles are set: release the old
            # client before it is overwritten below, otherwise it would be leaked.
            await self.close()

        print(f"Attempting to connect to MongoDB: {self.db_name}...")
        try:
            self.async_client = motor.motor_asyncio.AsyncIOMotorClient(self.connection_string, serverSelectionTimeoutMS=5000)
            await self.async_client.admin.command('ping') # Verify connection
            self.db = self.async_client[self.db_name]
            self.documents_collection = self.db["legal_documents_akn"]
            print(f"✅ Connected to MongoDB: {self.db_name}, collection: {self.documents_collection.name}")
            return True
        except Exception as e:
            print(f"❌ MongoDB connection failed: {e}")
            if self.async_client is not None: # pragma: no cover
                self.async_client.close()
            self.async_client = None
            self.db = None
            self.documents_collection = None
            return False

    async def close(self):
        """Close MongoDB connection and reset attributes.

        Safe to call when nothing is connected; in that case it only resets the
        (already ``None``) handles.
        """
        if self.async_client is not None:
            self.async_client.close()
            print("MongoDB connection closed.")
        self.async_client = None
        self.db = None
        self.documents_collection = None

# Initialize MongoDB manager
# Module-level instance referenced by the tool functions defined later in this notebook.
# Constructing it only stores settings; the connection is opened by connect().
mongo_manager = MongoDBManager()

# Example of how to connect (will be called later in the main workflow)
# await mongo_manager.connect()

In [71]:
# Cell 5: Tool Schemas (Revised)
# The schemas below are assembled with three small builders so that every tool
# follows the same key layout: {"type": "function", "function": {name, description, parameters}}.

_NO_DEFAULT = object()  # sentinel: distinguishes "no default given" from a default of None
_DOC_ID_DESC = "Unique identifier for the document."


def _nullable(json_type):
    """Return a fresh JSON-Schema type list allowing ``json_type`` or null."""
    return [json_type, "null"]


def _prop(json_type, description, *, enum=None, default=_NO_DEFAULT):
    """Build one property spec with keys ordered type, enum, default, description."""
    spec = {"type": json_type}
    if enum is not None:
        spec["enum"] = list(enum)
    if default is not _NO_DEFAULT:
        spec["default"] = default
    spec["description"] = description
    return spec


def _object_schema(properties, required, description=None):
    """Build an object schema with keys ordered type, description, properties, required."""
    schema = {"type": "object"}
    if description is not None:
        schema["description"] = description
    schema["properties"] = properties
    schema["required"] = list(required)
    return schema


def _function_tool(name, description, parameters):
    """Wrap a parameters schema in the function-tool envelope."""
    return {
        "type": "function",
        "function": {"name": name, "description": description, "parameters": parameters},
    }


tools_schemas = [
    _function_tool(
        "initialize_akn_document",
        "Initializes a new Akoma Ntoso document skeleton in the database with basic metadata. Should be called only once for a new document, typically when processing the first chunk.",
        _object_schema(
            properties={
                "document_id": _prop("string", "Unique identifier for the document (e.g., UU_8_1961)."),
                "source_file": _prop("string", "Filename of the source text document."),
                "doc_type_hint": _prop("string", "Hint for the AKN document type (e.g., 'act', 'bill', 'judgment'). Defaults to 'act'."),
                "initial_metadata": _object_schema(
                    description="Basic metadata extracted from the first part of the document.",
                    properties={
                        "title": _prop("string", "Official title (e.g., 'WAJIB KERJA SARJANA')."),
                        "number": _prop("string", "Document number (e.g., '8')."),
                        "year": _prop("string", "Year of the document (e.g., '1961')."),
                        "doc_subtype": _prop(_nullable("string"), "Subtype like 'UNDANG-UNDANG', 'PERATURAN PEMERINTAH'."),
                        "country_code": _prop("string", "ISO 3166-1 alpha-2 country code, defaults to 'id'.", default="id"),
                        "author_placeholder": _prop(_nullable("string"), "Placeholder for authoring authority if known."),
                    },
                    required=["title", "number", "year", "doc_subtype"],
                ),
            },
            required=["document_id", "source_file", "initial_metadata"],
        ),
    ),
    _function_tool(
        "get_akn_document_context",
        "Retrieves parts of the current AKN XML document to provide context to the agent. Can fetch the full document, a specific element by eId, or its parent and siblings.",
        _object_schema(
            properties={
                "document_id": _prop("string", _DOC_ID_DESC),
                "target_eId": _prop(_nullable("string"), "Optional eId of the element to focus on. If null, might return high-level structure or last modified area."),
                "context_type": _prop(
                    "string",
                    "Type of context to retrieve if target_eId is specified. 'element_children_only' returns only direct children of the target_eId.",
                    enum=["full", "element_only", "element_with_children", "parent_and_siblings", "element_children_only"],
                    default="element_with_children",
                ),
                "max_depth": _prop("integer", "Max depth of children to retrieve if 'element_with_children' is chosen (not used for 'element_children_only').", default=2),
            },
            required=["document_id"],
        ),
    ),
    _function_tool(
        "add_akn_element",
        "Adds a new Akoma Ntoso element (as an XML string) as a child of a specified parent element or as a sibling to another element within the document. Handles <content> tag creation for articles/paragraphs if needed.",
        _object_schema(
            properties={
                "document_id": _prop("string", _DOC_ID_DESC),
                "parent_eId": _prop(_nullable("string"), "The eId of the parent element to which the new element will be appended as a child. Use eId of <act> (e.g. 'act') to add top-level elements like preamble, body. If adding to an article's content, provide article's eId."),
                "new_element_akn_xml": _prop("string", "The well-formed AKN XML string for the new element to be added. Ensure it's valid XML and adheres to AKN structure."),
                "sibling_eId": _prop(_nullable("string"), "Optional eId of an existing sibling element. If provided, 'insert_position' determines placement relative to this sibling."),
                "insert_position": _prop(
                    "string",
                    "Position to insert. 'append_to_parent' adds as last child. 'before_sibling'/'after_sibling' require sibling_eId. 'append_to_content_of_parent' is for adding to <content> of elements like <article> or <paragraph>; it finds/creates <content> under parent_eId and appends there.",
                    enum=["append_to_parent", "before_sibling", "after_sibling", "append_to_content_of_parent"],
                    default="append_to_parent",
                ),
            },
            # parent_eId becomes more crucial depending on insert_position
            required=["document_id", "new_element_akn_xml"],
        ),
    ),
    _function_tool(
        "update_akn_element",
        "Updates an existing Akoma Ntoso element in the document. Can update attributes, text content, or append to text content of a suitable child <p> tag.",
        _object_schema(
            properties={
                "document_id": _prop("string", _DOC_ID_DESC),
                "target_eId": _prop("string", "The eId of the AKN element to modify. If appending text and this is a structural element, the tool will try to find the last <p> in its <content>."),
                "new_attributes": _prop(_nullable("object"), "A dictionary of attribute key-value pairs to set/update. E.g. {\"num\": \"(new)\"}."),
                "new_text_content": _prop(_nullable("string"), "If provided, REPLACES the direct text content of the element. Child elements are removed."),
                "append_text_content": _prop(_nullable("string"), "If provided, appends this text. If target_eId is a <p> tag, appends to it. If target_eId is a structural element (e.g. <paragraph>, <article>), it tries to append to the text of the LAST <p> child within its <content> element. If no <p> is found, it may append to the target_eId's direct text if appropriate or return an error."),
            },
            required=["document_id", "target_eId"],
        ),
    ),
    _function_tool(
        "get_last_p_eId_in_element",
        "Finds the eId of the last <p> element within a specified container element (typically inside its <content>). Useful for appending text accurately.",
        _object_schema(
            properties={
                "document_id": _prop("string", _DOC_ID_DESC),
                "container_eId": _prop("string", "The eId of the container element (e.g., a <paragraph> or <article>) to search within."),
            },
            required=["document_id", "container_eId"],
        ),
    ),
    _function_tool(
        "store_chunk_processing_info",
        "Stores information about the processing of a specific chunk, including a summary and the eId of the last AKN element significantly handled in that chunk. THIS MUST BE CALLED AT THE END OF PROCESSING EACH CHUNK.",
        _object_schema(
            properties={
                "document_id": _prop("string", _DOC_ID_DESC),
                "chunk_id": _prop("string", "Identifier of the processed chunk."),
                "analysis_summary": _prop("string", "A brief summary of what was identified and done in this chunk."),
                "last_processed_akn_eId": _prop("string", "The eId of the primary Akoma Ntoso element that was last created or modified in this chunk. This helps in resuming context for the next chunk."),
                "errors_or_notes": _prop(_nullable("string"), "Any errors encountered or specific notes from the agent for this chunk."),
            },
            required=["document_id", "chunk_id", "analysis_summary", "last_processed_akn_eId"],
        ),
    ),
]
print(f"Defined {len(tools_schemas)} tool schemas.")

Defined 6 tool schemas.


In [72]:
# Cell 6: Tool Implementations (Revised XPath usage)

# --- XML Helper Functions (using lxml) ---
# Helpers defined below: parse_xml_string, xml_to_string, find_element_by_eid, find_or_create_content_element.
# They use AKN_NAMESPACE and NSMAP from the Akoma Ntoso configuration cell, so run that cell first.

def parse_xml_string(xml_string: str) -> Optional[etree._Element]:
    """Parse an XML string into an lxml element, returning None on any failure.

    Args:
        xml_string: XML text. Empty or whitespace-only input is rejected.

    Returns:
        The root element, or None when the input is empty or cannot be parsed.
        Failures are reported with ``print`` rather than raised, so callers only
        need to check for None.

    The parser drops ignorable whitespace (``remove_blank_text``), keeps CDATA
    sections, and does NOT expand entity references: XML reaching this helper can
    come from an LLM tool call, so entity expansion (the usual XXE vector) is
    disabled. Predefined entities such as ``&amp;`` are unaffected.
    """
    try:
        if not xml_string or not xml_string.strip():
            print("XMLSyntaxError: Input XML string is empty or whitespace.")
            return None
        parser = etree.XMLParser(
            remove_blank_text=True,
            strip_cdata=False,
            resolve_entities=False,  # do not expand DTD-declared entities from untrusted XML
        )
        # Encode to bytes first: lxml rejects str input that carries an encoding declaration.
        return etree.fromstring(xml_string.encode('utf-8'), parser=parser)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError: {e}. Problematic XML: {xml_string[:500]}...")
        return None
    except Exception as e:
        print(f"Error parsing XML: {e}. XML: {xml_string[:500]}...")
        return None

def xml_to_string(element: etree._Element, pretty_print=True) -> str:
    """Serialize an lxml element to a unicode string without an XML declaration.

    Returns the empty string when ``element`` is None.
    """
    return (
        ""
        if element is None
        else etree.tostring(
            element,
            encoding="unicode",
            xml_declaration=False,
            pretty_print=pretty_print,
        )
    )

def find_element_by_eid(root: etree._Element, eid: str) -> Optional[etree._Element]:
    """Find the first element whose ``eId`` attribute equals ``eid``.

    Descendants of ``root`` are searched first (document order); ``root`` itself
    is checked only if no descendant matches. Returns None when ``root`` or
    ``eid`` is None or nothing matches.

    The eId is passed to XPath as a bound variable instead of being spliced into
    the expression text, so an eId containing a quote character can neither
    break the query nor alter it.
    """
    if root is None or eid is None:
        return None
    matches = root.xpath(".//*[@eId=$eid]", eid=str(eid))
    if matches:
        return matches[0]
    # ".//*" only covers descendants, so the root needs its own comparison.
    return root if root.get("eId") == eid else None

def find_or_create_content_element(parent_element: etree._Element) -> Optional[etree._Element]:
    """Return the direct ``<content>`` child of ``parent_element``, creating it if absent.

    A newly created ``<content>`` is appended as the last child of the parent and
    is given no eId. Returns None when ``parent_element`` is None.
    """
    if parent_element is None:
        return None
    content_el = parent_element.find(f"{{{AKN_NAMESPACE}}}content")
    if content_el is None:
        content_el = etree.SubElement(parent_element, etree.QName(AKN_NAMESPACE, "content"))
        # Optional: give the new <content> an eId derived from its parent, e.g.
        #   parent_eid = parent_element.get("eId")
        #   if parent_eid:
        #       content_el.set("eId", f"{parent_eid}__content")
    return content_el

# --- Tool Implementations ---
# Registry of tool name -> async implementation. Each tool function adds itself
# to this dict on the line right after its definition.
TOOL_MAPPING = {}

# Tool: initialize_akn_document - builds the empty AKN skeleton for a new document and stores it in MongoDB.
async def initialize_akn_document(document_id: str, source_file: str, doc_type_hint: str = "ACT", initial_metadata: dict = None) -> str: # Changed default to ACT
    """Create the Akoma Ntoso skeleton for a new document and insert it into MongoDB.

    The skeleton is ``<akomaNtoso>`` wrapping one document element that contains
    ``meta`` (FRBRWork + FRBRManifestation identification) and empty ``preamble``,
    ``body``, ``conclusions`` and ``attachments`` containers. Every eId is prefixed
    with the upper-cased ``doc_type_hint`` (e.g. ``ACT__body``), which is also the
    document element's tag name and eId.

    NOTE(review): the Akoma Ntoso schema spells element names in lower case
    (``act``); the upper-case tag is kept here because eIds derived from it may be
    relied on elsewhere in the notebook - confirm before changing.

    Args:
        document_id: Unique key of the document in the collection.
        source_file: Name of the source text file; stored as-is.
        doc_type_hint: Document type; None or empty falls back to "ACT".
        initial_metadata: Optional dict with ``title``, ``number``, ``year``,
            ``doc_subtype`` and ``country_code``. ``doc_subtype`` may be None
            (the tool schema allows null), in which case the lower-cased
            document type is used in the FRBR URIs.

    Returns:
        A human-readable status string: a success message naming the root eId,
        or a message starting with "Error" when MongoDB is not connected, the
        document already exists, or the insert fails.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."
    if initial_metadata is None:
        initial_metadata = {}

    existing_doc = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if existing_doc:
         return f"Error: Document {document_id} already exists. Cannot re-initialize. Delete it first if a fresh start is intended."

    def akn(local_name: str) -> etree.QName:
        """Qualified tag name in the AKN namespace."""
        return etree.QName(AKN_NAMESPACE, local_name)

    # One timestamp for the whole call so created_at == updated_at on a fresh document.
    now = datetime.now(timezone.utc)

    akn_root = etree.Element(akn("akomaNtoso"), nsmap=NSMAP)
    doc_element_name = (doc_type_hint or "ACT").upper() # Ensure it's uppercase like 'ACT'
    doc_el = etree.SubElement(akn_root, akn(doc_element_name), eId=doc_element_name)
    doc_el.set("name", "indonesianAct")

    meta_el = etree.SubElement(doc_el, akn("meta"), eId=f"{doc_element_name}__meta")
    identification_el = etree.SubElement(meta_el, akn("identification"), eId=f"{doc_element_name}__meta__identification", source="#sourceSystem")

    frbr_work_el = etree.SubElement(identification_el, akn("FRBRWork"), eId=f"{doc_element_name}__meta__identification__work")
    year = initial_metadata.get("year", "0000")
    number = initial_metadata.get("number", "0")
    title = initial_metadata.get("title", "Unknown Title")
    country = initial_metadata.get("country_code", "id")
    # `or` (not a .get default) so an explicit null doc_subtype also falls back.
    doc_subtype_for_uri = str(initial_metadata.get("doc_subtype") or doc_element_name.lower()).lower()
    work_uri = f"/akn/{country}/{doc_subtype_for_uri}/{year}/{number}"

    etree.SubElement(frbr_work_el, akn("FRBRthis"), value=f"{work_uri}/main")
    etree.SubElement(frbr_work_el, akn("FRBRuri"), value=work_uri)
    etree.SubElement(frbr_work_el, akn("FRBRdate"), date=f"{year}-01-01", name="creation")
    frbr_author_el = etree.SubElement(frbr_work_el, akn("FRBRauthor"), href="#system")
    # `as` is a Python keyword and cannot be passed as a keyword argument; writing
    # `as_=` would create an attribute literally named "as_", so set it explicitly.
    frbr_author_el.set("as", "#author")
    etree.SubElement(frbr_work_el, akn("FRBRcountry"), value=country)
    if title:
         etree.SubElement(frbr_work_el, akn("FRBRname"), value=str(title))
    if number:
         etree.SubElement(frbr_work_el, akn("FRBRnumber"), value=str(number))

    frbr_manifestation_el = etree.SubElement(identification_el, akn("FRBRManifestation"), eId=f"{doc_element_name}__meta__identification__manifestation")
    etree.SubElement(frbr_manifestation_el, akn("FRBRthis"), value=f"{work_uri}/id@main.xml")
    etree.SubElement(frbr_manifestation_el, akn("FRBRuri"), value=f"{work_uri}/id.xml")
    etree.SubElement(frbr_manifestation_el, akn("FRBRdate"), date=now.strftime("%Y-%m-%d"), name="publication")
    etree.SubElement(frbr_manifestation_el, akn("FRBRformat"), value="application/akn+xml")

    # Empty top-level containers that later tool calls fill in.
    for container_name in ("preamble", "body", "conclusions", "attachments"):
        etree.SubElement(doc_el, akn(container_name), eId=f"{doc_element_name}__{container_name}")

    akn_xml_string = xml_to_string(akn_root)

    doc_to_insert = {
        "document_id": document_id,
        "source_file": source_file,
        "document_metadata_extracted": initial_metadata,
        "akn_xml_string": akn_xml_string,
        "processing_state": {
            "current_chunk_id": None,
            "last_processed_akn_eId": doc_element_name,
            "status": "initialized"
        },
        "chunk_processing_log": [],
        "created_at": now,
        "updated_at": now
    }
    try:
        await mongo_manager.documents_collection.insert_one(doc_to_insert)
        return f"Successfully initialized Akoma Ntoso document '{document_id}' in database. Root eId: '{doc_element_name}'."
    except Exception as e:
        return f"Error initializing document '{document_id}': {str(e)}"
TOOL_MAPPING["initialize_akn_document"] = initialize_akn_document

# Tool: get_akn_document_context - returns XML snippets of the stored document for the agent to use as context.
async def get_akn_document_context(document_id: str, target_eId: Optional[str] = None,
                                   context_type: str = "element_with_children",
                                   max_depth: int = 2) -> str:
    """Return part of a stored AKN document as an XML string.

    Args:
        document_id: Key of the document in the collection.
        target_eId: eId of the element to focus on. When empty/None (and
            ``context_type`` is not "full") a summary is returned: the document
            element with attribute-only copies of its meta / preamble / body /
            conclusions / attachments children.
        context_type: One of
            "full" - the whole document;
            "element_only" and "element_with_children" - the target element
            serialized with its complete subtree (both behave the same here);
            "element_children_only" - copies of the target's direct children
            inside a ``<childrenContext>`` wrapper;
            "parent_and_siblings" - the target's parent with its whole subtree,
            or the target itself when it has no parent.
        max_depth: Accepted for compatibility with the tool schema; the current
            implementation does not prune by depth.

    Returns:
        The XML snippet, or a message starting with "Error" when MongoDB is not
        connected, the document or element is missing, the stored XML cannot be
        parsed, or ``context_type`` is not recognised.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found or has no AKN XML."

    root = parse_xml_string(doc_data["akn_xml_string"])
    if root is None:
        return "Error: Could not parse stored AKN XML."

    if context_type == "full":
        return xml_to_string(root)

    if not target_eId:
        # Summary view. Comments and processing instructions expose a non-string
        # `.tag`, so they are skipped rather than crashing on `.endswith`.
        main_doc_element = next((child for child in root if isinstance(child.tag, str)), root)
        top_level_suffixes = ("meta", "preamble", "body", "conclusions", "attachments")
        summary_context = etree.Element(main_doc_element.tag, nsmap=NSMAP, attrib=dict(main_doc_element.attrib))
        for child in main_doc_element:
            if isinstance(child.tag, str) and child.tag.endswith(top_level_suffixes):
                summary_context.append(etree.Element(child.tag, nsmap=NSMAP, attrib=dict(child.attrib)))
        return xml_to_string(summary_context) if len(summary_context) > 0 else xml_to_string(main_doc_element)

    element_to_focus = find_element_by_eid(root, target_eId)
    if element_to_focus is None:
        return f"Error: Element with eId '{target_eId}' not found in document '{document_id}'. Check eId or use a known parent."

    if context_type in ("element_only", "element_with_children"):
        return xml_to_string(element_to_focus) # Returns element and its full subtree

    if context_type == "element_children_only":
        children_wrapper = etree.Element(etree.QName(AKN_NAMESPACE, "childrenContext"), nsmap=NSMAP)
        for child in element_to_focus:
            # deepcopy instead of a tostring/fromstring round-trip: serializing a
            # child includes its tail text, and re-parsing "<x/>tail" raises
            # XMLSyntaxError for mixed content such as <p>a <b/> c</p>.
            children_wrapper.append(copy.deepcopy(child))
        return xml_to_string(children_wrapper)

    if context_type == "parent_and_siblings":
        parent = element_to_focus.getparent()
        if parent is None:
            return xml_to_string(element_to_focus) # Target is root or has no parent in this view
        # `root` is a private parse of the stored string and is only read here,
        # so the parent can be serialized directly without copying it first.
        return xml_to_string(parent)

    return f"Error: Invalid context_type '{context_type}' or other issue with target_eId."
TOOL_MAPPING["get_akn_document_context"] = get_akn_document_context

# add_akn_element: insert one new AKN element (with any nested children) into a stored document.
async def add_akn_element(document_id: str, new_element_akn_xml: str,
                          parent_eId: Optional[str] = None,
                          sibling_eId: Optional[str] = None,
                          insert_position: str = "append_to_parent") -> str:
    """Parse an AKN XML snippet and insert it into the document stored in MongoDB.

    Args:
        document_id: Value of the ``document_id`` field of the MongoDB record.
        new_element_akn_xml: XML of the element to insert; it may contain nested elements.
        parent_eId: eId of the parent element. Required for every insert position;
            for the sibling positions it must be the direct parent of ``sibling_eId``.
        sibling_eId: eId of the reference sibling, required for
            ``"before_sibling"`` / ``"after_sibling"``.
        insert_position: One of ``"append_to_parent"``,
            ``"append_to_content_of_parent"``, ``"before_sibling"``, ``"after_sibling"``.

    Returns:
        A status string for the agent: ``"Successfully added ..."`` on success,
        otherwise a message starting with ``"Error"``. Validation and save
        failures are reported through the return value rather than raised.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found."

    current_xml_str = doc_data["akn_xml_string"]
    root = parse_xml_string(current_xml_str)
    if root is None: return "Error: Could not parse stored AKN XML."

    try:
        new_element = parse_xml_string(new_element_akn_xml)
        if new_element is None: return f"Error: Could not parse new_element_akn_xml: {new_element_akn_xml[:100]}"
    except Exception as e:
        return f"Error parsing new_element_akn_xml: {str(e)}. Snippet: {new_element_akn_xml[:100]}"

    new_element_eId_attr = new_element.get("eId")
    if not new_element_eId_attr:
        # Only simple text carriers may be added without an eId; any other
        # (structural) element added directly through this tool must carry one.
        if etree.QName(new_element.tag).localname not in ['p', 'num', 'heading']:
             return f"Error: New element XML is missing an 'eId' attribute for structural element. XML: {new_element_akn_xml[:200]}"
    elif find_element_by_eid(root, new_element_eId_attr) is not None:
        return f"Error: Element with eId '{new_element_eId_attr}' already exists in the document. Cannot add duplicate."

    # Nested elements are inserted together with the top-level one, so their
    # eIds must be unique too; otherwise later eId lookups become ambiguous.
    seen_eids = {new_element_eId_attr} if new_element_eId_attr else set()
    for descendant in new_element.iterdescendants():
        if not isinstance(descendant.tag, str):
            continue  # comments / processing instructions carry no eId
        nested_eid = descendant.get("eId")
        if not nested_eid:
            continue
        if nested_eid in seen_eids:
            return f"Error: eId '{nested_eid}' is used more than once inside the new element XML. eIds must be unique."
        if find_element_by_eid(root, nested_eid) is not None:
            return f"Error: Nested element with eId '{nested_eid}' already exists in the document. Cannot add duplicate."
        seen_eids.add(nested_eid)

    target_parent_for_ops = None # The element the new element is appended to

    if insert_position == "append_to_parent" or insert_position == "append_to_content_of_parent":
        if not parent_eId: return f"Error: parent_eId is required for '{insert_position}'."

        target_parent_for_ops = find_element_by_eid(root, parent_eId)
        if target_parent_for_ops is None: return f"Error: Parent element with eId '{parent_eId}' not found."

        if insert_position == "append_to_content_of_parent":
            parent_tag_name = etree.QName(target_parent_for_ops.tag).localname
            # Parent tags for which the new element is routed into a <content> child.
            content_holder_tags = ["article", "paragraph", "clause", "recital", "citation",
                                   "speech", "question", "answer", "other", "scene", "point", "item",
                                   "chapter", "section", "subsection", "alinea"]
            if parent_tag_name in content_holder_tags:
                # presumably returns the existing <content> child or creates one — helper is defined elsewhere
                content_element = find_or_create_content_element(target_parent_for_ops)
                if content_element is None:
                    return f"Error: Could not find or create <content> in parent eId '{parent_eId}'."
                content_element.append(new_element)
            else: # Not a listed content holder: append directly to the parent.
                target_parent_for_ops.append(new_element)
        else: # append_to_parent
            target_parent_for_ops.append(new_element)

    elif insert_position in ["before_sibling", "after_sibling"]:
        if not sibling_eId: return f"Error: sibling_eId is required for '{insert_position}'."
        # parent_eId is the parent of the sibling; it is used to validate the placement.
        if not parent_eId: return f"Error: parent_eId (of the sibling) is required for '{insert_position}'."

        actual_parent_of_sibling = find_element_by_eid(root, parent_eId)
        if actual_parent_of_sibling is None: return f"Error: Specified parent element '{parent_eId}' (for sibling) not found."

        target_sibling = find_element_by_eid(actual_parent_of_sibling, sibling_eId)
        if target_sibling is None: return f"Error: Sibling element with eId '{sibling_eId}' not found under parent '{parent_eId}'."

        # Reject siblings that are nested deeper than one level under the stated parent.
        if target_sibling.getparent() is None or target_sibling.getparent() != actual_parent_of_sibling :
             return f"Error: Sibling element '{sibling_eId}' is not a direct child of parent '{parent_eId}'."

        if insert_position == "before_sibling":
            target_sibling.addprevious(new_element)
        else:
            target_sibling.addnext(new_element)
    else:
        return f"Error: Invalid insert_position '{insert_position}'."

    updated_xml_str = xml_to_string(root)
    try:
        result = await mongo_manager.documents_collection.update_one(
            {"document_id": document_id},
            {"$set": {"akn_xml_string": updated_xml_str, "updated_at": datetime.now(timezone.utc)}}
        )
        # A zero match means nothing was saved (e.g. the record disappeared after
        # it was read); do not report success in that case.
        if result.matched_count == 0:
            return f"Error: Document '{document_id}' not found when saving the added element."
        # Confirm with the eId when there is one, otherwise with the tag name.
        confirmation_id = new_element_eId_attr if new_element_eId_attr else etree.QName(new_element.tag).localname
        return f"Successfully added element '{confirmation_id}' to document '{document_id}'."
    except Exception as e:
        return f"Error updating document in DB after adding element: {str(e)}"
TOOL_MAPPING["add_akn_element"] = add_akn_element

# update_akn_element: change attributes and/or text of an existing element.
async def update_akn_element(document_id: str, target_eId: str,
                             new_attributes: Optional[dict] = None,
                             new_text_content: Optional[str] = None,
                             append_text_content: Optional[str] = None) -> str:
    """Update attributes and/or text of the element identified by ``target_eId``.

    Args:
        document_id: Value of the ``document_id`` field of the MongoDB record.
        target_eId: eId of the element to modify.
        new_attributes: Attribute names mapped to new values; values are
            stored as ``str(value)``.
        new_text_content: When not None, every child of the target is removed
            and the target's text becomes this string. For a structural
            element (e.g. ``<paragraph>``) this also discards its
            ``<content>``/``<p>`` children, so it is normally only appropriate
            for text carriers such as ``<p>`` or ``<num>``.
        append_text_content: Text added at the END of an existing paragraph.
            For a target that is not itself a text carrier, the text goes to
            the last ``<p>`` (document order) inside its ``<content>`` child,
            or inside the target when there is no ``<content>``; if no ``<p>``
            is found it is appended to the target itself.

    Returns:
        A status string: ``"Successfully updated ..."``, ``"Info: ..."`` when
        nothing was requested, or a message starting with ``"Error"``.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found."

    current_xml_str = doc_data["akn_xml_string"]
    root = parse_xml_string(current_xml_str)
    if root is None: return "Error: Could not parse stored AKN XML."

    target_element = find_element_by_eid(root, target_eId)
    if target_element is None: return f"Error: Element with eId '{target_eId}' not found."

    updated = False
    if new_attributes:
        for key, value in new_attributes.items():
            target_element.set(key, str(value))
        updated = True

    if new_text_content is not None:
        # Drop all children (iterate over a copy so removal is safe), then set the text.
        for child in list(target_element):
            target_element.remove(child)
        target_element.text = new_text_content
        # The target's tail is deliberately left alone: tail text sits AFTER the
        # element and belongs to the parent's content, so clearing it would
        # delete text outside the element being updated.
        updated = True

    if append_text_content:
        element_to_append_to = target_element
        target_tag_name = etree.QName(target_element.tag).localname

        # For anything that is not itself a text carrier, redirect to the last <p>.
        if target_tag_name not in ["p", "heading", "num", "td", "th", "caption"]: # e.g. article, paragraph
            content_el = target_element.find(f"{{{AKN_NAMESPACE}}}content")
            search_context = content_el if content_el is not None else target_element
            # The parentheses matter: ".//akn:p[last()]" would select every <p> that
            # is the last <p> child of its own parent, whereas "(.//akn:p)[last()]"
            # selects the single last <p> descendant in document order.
            last_p_elements = search_context.xpath("(.//akn:p)[last()]", namespaces={'akn': AKN_NAMESPACE})
            if last_p_elements:
                element_to_append_to = last_p_elements[0]
            # No <p> found: fall back to the target element itself.

        # Append at the very end of the element. With child elements present the
        # trailing text lives in the last child's tail; adding to .text would
        # insert the new text before the first child instead.
        if len(element_to_append_to) > 0:
            last_child = element_to_append_to[-1]
            last_child.tail = (last_child.tail or "") + append_text_content
        else:
            element_to_append_to.text = (element_to_append_to.text or "") + append_text_content
        updated = True

    # Nothing requested (including empty attribute dicts / empty append text):
    # skip the database write and say so.
    if not updated:
        return f"Info: No update operation specified for element '{target_eId}'."

    updated_xml_str = xml_to_string(root)
    try:
        result = await mongo_manager.documents_collection.update_one(
            {"document_id": document_id},
            {"$set": {"akn_xml_string": updated_xml_str, "updated_at": datetime.now(timezone.utc)}}
        )
        # A zero match means nothing was saved; do not report success in that case.
        if result.matched_count == 0:
            return f"Error: Document '{document_id}' not found when saving the update to element '{target_eId}'."
        return f"Successfully updated element '{target_eId}' in document '{document_id}'."
    except Exception as e:
        return f"Error updating document '{document_id}' in DB after updating element: {str(e)}"
TOOL_MAPPING["update_akn_element"] = update_akn_element

# get_last_p_eId_in_element: report the eId of the last <p> inside a container.
async def get_last_p_eId_in_element(document_id: str, container_eId: str) -> str:
    """Find the last ``<p>`` (in document order) inside a container element.

    The search runs inside the container's direct ``<content>`` child when it
    has one, otherwise inside the container itself.

    Args:
        document_id: Value of the ``document_id`` field of the MongoDB record.
        container_eId: eId of the element to search in.

    Returns:
        A status string: ``"Success: ..."`` with the eId, ``"Info: ..."`` when
        the last ``<p>`` has no eId or no ``<p>`` exists, or a message starting
        with ``"Error"``.
    """
    if mongo_manager.documents_collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    doc_data = await mongo_manager.documents_collection.find_one({"document_id": document_id})
    if not doc_data or "akn_xml_string" not in doc_data:
        return f"Error: Document '{document_id}' not found or has no AKN XML."

    akn_xml_string = doc_data["akn_xml_string"]
    root = parse_xml_string(akn_xml_string)
    if root is None:
        return "Error: Could not parse stored AKN XML."

    container_element = find_element_by_eid(root, container_eId)
    if container_element is None:
        return f"Error: Container element with eId '{container_eId}' not found."

    try:
        # Search within <content> if it exists, otherwise directly in the container.
        content_el = container_element.find(f"{{{AKN_NAMESPACE}}}content")
        search_context = content_el if content_el is not None else container_element

        # The parentheses matter: ".//akn:p[last()]" selects every <p> that is the
        # last <p> child of its own parent (possibly several, the first of which
        # is NOT the last paragraph); "(.//akn:p)[last()]" selects the single
        # last <p> descendant in document order.
        last_p_elements = search_context.xpath("(.//akn:p)[last()]", namespaces={'akn': AKN_NAMESPACE})
    except etree.XPathEvalError as e:
        return f"Error: XPath evaluation error in get_last_p_eId_in_element for '{container_eId}': {e}"

    if last_p_elements:
        last_p_eId = last_p_elements[0].get("eId")
        if last_p_eId:
            return f"Success: eId of last <p> in '{container_eId}' is '{last_p_eId}'."
        else:
            # Without an eId the <p> cannot be addressed; give the agent a text
            # preview so it can still recognise the paragraph.
            p_text_preview = (last_p_elements[0].text or "")[:30]
            return f"Info: Last <p> element in '{container_eId}' found, but it has no eId. Preview: '{p_text_preview}...'"
    else:
        return f"Info: No <p> elements found within container '{container_eId}' (or its <content> child)."
TOOL_MAPPING["get_last_p_eId_in_element"] = get_last_p_eId_in_element

# store_chunk_processing_info: record per-chunk progress on the document record.
async def store_chunk_processing_info(document_id: str, chunk_id: str,
                                      analysis_summary: str, last_processed_akn_eId: str,
                                      errors_or_notes: Optional[str] = None) -> str:
    """Append a chunk log entry and refresh the document's processing state.

    Pushes one entry onto ``chunk_processing_log`` and sets the
    ``processing_state.*`` fields and ``updated_at`` on the record whose
    ``document_id`` matches.

    Args:
        document_id: Value of the ``document_id`` field of the MongoDB record.
        chunk_id: Identifier of the chunk that was just processed.
        analysis_summary: Free-text summary stored in the log entry.
        last_processed_akn_eId: eId stored both in the log entry and in
            ``processing_state.last_processed_akn_eId``.
        errors_or_notes: Optional free-text notes stored in the log entry.

    Returns:
        A status string: ``"Successfully stored ..."`` or a message starting
        with ``"Error"``.
    """
    collection = mongo_manager.documents_collection
    if collection is None:
        return "Error: MongoDB not connected or collection not initialized."

    chunk_log_record = {
        "chunk_id": chunk_id,
        "analysis_summary": analysis_summary,
        "last_processed_akn_eId_at_end_of_chunk": last_processed_akn_eId,
        "errors_or_notes": errors_or_notes,
        "timestamp": datetime.now(timezone.utc)
    }
    state_fields = {
        "processing_state.current_chunk_id": chunk_id,
        "processing_state.last_processed_akn_eId": last_processed_akn_eId,
        "processing_state.status": "processing",
        "updated_at": datetime.now(timezone.utc)
    }
    mongo_update = {"$push": {"chunk_processing_log": chunk_log_record}, "$set": state_fields}

    # Cheap existence probe first, so a missing document gets a specific message.
    matching_docs = await collection.count_documents({"document_id": document_id}, limit=1)
    if matching_docs == 0:
        return f"Error: Document '{document_id}' not found. Cannot store chunk info. Was it initialized?"

    try:
        outcome = await collection.update_one({"document_id": document_id}, mongo_update)
    except Exception as e:
        return f"Error storing chunk processing info for '{document_id}': {str(e)}"
    else:
        # matched_count (not modified_count) is the success signal used here.
        if outcome.matched_count == 0:
            return f"Error: Document '{document_id}' not found for storing chunk info (race condition?)."
        return f"Successfully stored processing info for chunk '{chunk_id}' of document '{document_id}'. Last eId set to '{last_processed_akn_eId}'."
TOOL_MAPPING["store_chunk_processing_info"] = store_chunk_processing_info

print(f"Defined {len(TOOL_MAPPING)} tool implementations with revisions.")

Defined 6 tool implementations with revisions.


In [73]:
# Cell 7: Agent System Prompt (Revised)
#
# Builds `agent_system_prompt`, the instruction text for the AKN-conversion agent
# (presumably passed as `system_prompt_content` to run_agentic_loop — confirm at the call site).
#
# The prompt is an f-string whose only interpolated field is
# {AKN_STRUCTURE_MAPPING_INFO}; that constant must already be defined or this cell
# raises NameError. Any literal curly brace added to the prompt text must be doubled.
#
# NOTE(review): in step 4 of the workflow example the quotes are written as \\\\" inside a
# non-raw string, so the model sees two backslashes before each quote in the sample
# XML. Confirm whether plain double quotes were intended.
# NOTE(review): the prompt refers to upper-case eIds ("ACT", "ACT__body", ...), while the
# orchestration cell falls back to a lower-case "act" default. Confirm which casing
# initialize_akn_document actually produces.

agent_system_prompt = f"""
You are an expert legislative informatics assistant specializing in converting Indonesian legal documents into Akoma Ntoso (AKN) v3.0 XML format.
Your primary goal is to analyze chunks of an Indonesian legal text and incrementally construct a well-formed and semantically rich Akoma Ntoso XML document.
You MUST strictly adhere to Akoma Ntoso structure and use the provided mappings and eId strategy.
The document will be built incrementally. Always consider the current state of the AKN XML document and the context of the last processed element.

### New Emphasis: Textual Accuracy and Correction
**Crucial Task - Textual Fidelity:** Before incorporating text into AKN elements, you MUST meticulously review it for OCR errors, spelling mistakes, grammatical inaccuracies, and awkward phrasing carried over from the source.
* Correct common OCR errors (e.g., "setiaporang" to "setiap orang", "Pemerintah mengalami kekurangan akan tenaga sarjana" to "Pemerintah mengalami kekurangan tenaga sarjana").
* Ensure proper Indonesian grammar and spelling (EYD/PUEBI).
* Resolve run-on words or incorrectly split words.
* Identify and reconcile near-duplicate text segments that might arise from OCR or formatting issues in the source, especially when appending text.
* When you make a correction, briefly note the correction in your reasoning before calling the tool. For example: "Corrected 'pasal5ayat' to 'pasal 5 ayat'. Incorporating corrected text into a new <p> element."
* The goal is clean, accurate, and readable text within the XML, while preserving the original legal meaning.

**Core Task:**
For each chunk of text provided:
1.  Analyze the text to identify its structural and semantic meaning. ### Consider the potential for variations from the standard structure.
2.  Determine how this text fits into the existing Akoma Ntoso XML document structure. ### Pay attention to the specific type of Indonesian legal document if discernible (e.g., UU, PP, Perpres) as structures might vary.
3.  Construct appropriate Akoma Ntoso XML snippet(s). Generated XML snippets MUST be well-formed and have unique `eId` attributes for new elements.
4.  Use the available tools strategically:
    * **`initialize_akn_document`**:
        * Call ONLY for the very first chunk if the document isn't initialized.
        * This tool creates the main document element (e.g., `<ACT eId="ACT">`) AND its primary children: `<meta eId="ACT__meta">`, `<preamble eId="ACT__preamble">`, `<body eId="ACT__body">`, `<conclusions eId="ACT__conclusions">`, `<attachments eId="ACT__attachments">`.
        * **IMPORTANT**: After initialization, you MUST use these specific `eId`s as `parent_eId` when adding content to these main sections.
    * **`get_akn_document_context`**: CRITICAL. Before adding or updating any element, and especially when unsure about the current structure or where new content fits:
        * Fetch context of the `last_processed_akn_eId` or its likely parent.
        * Use it to verify the existence of target parent elements (e.g., `ACT__body`, `ACT__preamble__recitals_1`, an elucidation's `articleByArticleExplanation` container).
        * Use it to see the current children of an element before adding new siblings, to determine correct numbering or placement.
        * Use it to inspect the text of the last `<p>` if you are about to append, to ensure a smooth continuation and apply corrections.
    * **`add_akn_element`**:
        * For adding new structural elements (Pasal, Ayat, new list items, new containers in preamble/conclusions/elucidation).
        * When adding an element like `<paragraph>` (Ayat) to an `<article>`, use `parent_eId` of the `<article>` (e.g., `body__art_1`) and `insert_position="append_to_content_of_parent"`. The tool handles finding/creating the `<content>` sub-element.
        * If adding a simple `<p>` text directly into a `<content>` block (e.g., within an existing Ayat that has multiple paragraphs), ensure the parent is the `<content>` element or use `append_to_content_of_parent` with the Ayat's eId.
        * Ensure the `new_element_akn_xml` has a unique `eId` if it's a new structural block.
    * **`update_akn_element`**:
        * For modifying attributes or replacing text of an existing element.
        * For `append_text_content`:
            * This is for continuing text within an *existing* paragraph.
            * First, use `get_akn_document_context` to view the target element and its last `<p>` (if applicable).
            * Perform any necessary text corrections on the `append_text_content` itself *before* calling the tool.
            * If the target is a structural element (e.g., `<paragraph eId="X">`), the tool attempts to append to the last `<p>` in its `<content>`. Ensure this is the desired behavior. If you intend to create a *new, distinct paragraph* after the existing one, you should use `add_akn_element` to add a new `<p>` or a new `<paragraph>` structure.
    * **`get_last_p_eId_in_element`**: Use this to find the `eId` of the last `<p>` tag within a container when you specifically need to target that `<p>` for an update, though often `update_akn_element` with `append_text_content` on the container eId is sufficient.
5.  **MANDATORY FINAL STEP FOR EACH CHUNK:** After all operations for the current chunk are complete, you MUST call `store_chunk_processing_info`. This is essential for maintaining context for the next chunk. Provide an accurate `last_processed_akn_eId`.

**Akoma Ntoso Mapping and `eId` Strategy Details:**
{AKN_STRUCTURE_MAPPING_INFO} 
### Handling Structural Variations:
* **Preamble:** Indonesian laws can vary. While "Menimbang" and "Mengingat" are common, some might have only one, or additional specific containers (e.g., "Dengan rahmat Tuhan Yang Maha Esa", specific approval clauses). Use `get_akn_document_context` on `ACT__preamble` and add new preamble components as distinct `<container>` elements with appropriate `name` attributes (e.g., `name="divineGuidance"`, `name="furtherApproval"`) or mapped AKN elements like `<recitals>` / `<citations>`. Ensure `eId`s are hierarchical (e.g., `ACT__preamble__divineGuidance_1`).
* **Body:** The core structure is Chapters (`<chapter>`), Sections (`<section>`), Articles (`<article>`), Paragraphs (`<paragraph>` for Ayat). Be vigilant for documents that might omit chapters or sections. Always add articles directly to `ACT__body` if no intermediate chapter/section is identified for the current article.
* **Elucidations (`PENJELASAN`):** These are typically in `<attachments>`. An elucidation can have a general part (`<container name="generalExplanation">`) and an article-by-article part (`<container name="articleByArticleExplanation">`). Sometimes only one part is present. Create the necessary containers within the elucidation's `body` as needed. Clauses in "articleByArticleExplanation" should use `<clause refersTo="#body__art_N">`.

**Workflow Example (Conceptual for adding content to `body` after initialization):**
* User provides Chunk 1. Document state: `last_processed_akn_eId="ACT"` (from `initialize_akn_document`).
* You (Agent):
    1. "Analyzing Chunk 1. Text starts with 'Pasal 1...'. Document ID is 'doc_X', previous eId 'ACT'."
    2. "Text quality check: Corrected 'Pasal1' to 'Pasal 1'. The content seems to be a standard article."
    3. "Since 'ACT' is the root, I need to add this Pasal to the main body. The initialized body eId is 'ACT__body'."
    4. "I will construct the AKN for Pasal 1: `<article eId=\\\\"ACT__body__art_1\\\\"><num>Pasal 1.</num><content><p>Corrected text...</p></content></article>`."
    5. "I will call `add_akn_element` with `parent_eId='ACT__body'`, `new_element_akn_xml` for the article, `insert_position='append_to_parent'`."
    6. (Tool confirms success)
    7. "The main element processed is `ACT__body__art_1`."
    8. "Calling `store_chunk_processing_info` with `last_processed_akn_eId='ACT__body__art_1'`."

**Important Rules for Interaction:**
* **eIds:** Generate them carefully, hierarchically, and ensure they are unique.
* **Error Handling:** If a tool call returns an error, analyze the error message. Do NOT blindly retry. Use `get_akn_document_context` to re-verify the current structure. For "Element not found", double-check your `target_eId` and `parent_eId`.
* **XML Validity:** Ensure any XML you provide to tools is well-formed.
* **Reasoning:** Clearly articulate your analysis of the chunk, any text corrections made, your plan for AKN structuring, and the specific tool calls you intend to make with their parameters.

Your response should primarily be a sequence of tool calls. Explain your reasoning briefly but clearly.
"""
print("Agent system prompt defined.")

Agent system prompt defined.


In [74]:
def simple_chunk_text(text: str, chunk_size: int = 2000, overlap: int = 500) -> List[Dict[str, Any]]:
    """
    Split text into consecutive chunks of at most chunk_size characters, where
    each chunk repeats the last `overlap` characters of the previous one.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy 0 <= overlap < chunk_size so every chunk advances.

    Returns:
        One dict per chunk with the keys "chunk_id" ("chunk_001", ...),
        "start_pos", "end_pos" (exclusive, clamped to len(text)), "content"
        and "char_count". Together the chunks cover the whole text.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size. Such values would otherwise produce
            empty chunks or skip parts of the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}.")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}."
        )

    # The validated stride is always >= 1, so the loop below is guaranteed to
    # terminate and needs no artificial iteration cap (which would silently
    # truncate long documents).
    stride = chunk_size - overlap
    text_length = len(text)

    chunks_data = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk_text = text[start:end]

        chunks_data.append({
            "chunk_id": f"chunk_{len(chunks_data) + 1:03d}", # Consistent chunk ID
            "start_pos": start,
            "end_pos": end,
            "content": chunk_text,
            "char_count": len(chunk_text)
        })

        # Stop once a chunk reaches the end of the text; a further chunk would
        # only repeat characters that are already covered.
        if end >= text_length:
            break
        start += stride

    return chunks_data

print("Chunking function defined.")

Chunking function defined.


In [75]:
# Cell 9: Agent loop function (Revised)
async def run_agentic_loop(
    system_prompt_content: str,
    user_prompt_content: str,
    tools_list: list, # List of tool schemas
    tool_function_mapping: dict, # Mapping of tool name to async function
    llm_model_name: str,
    document_id_for_loop: str, # Used in log output only
    chunk_id_for_loop: str,    # Used in log output only
    max_iterations: int = 25 # Tunable: upper bound on LLM round-trips per chunk
):
    """Drive one chunk through the LLM tool-calling loop.

    Each iteration sends the conversation to the chat-completions endpoint of
    the module-level ``client``, executes every tool call in the reply through
    ``tool_function_mapping`` and feeds the results back as ``tool`` messages.

    The loop ends when:
      * ``store_chunk_processing_info`` returned a result containing "Success"
        (all other tool calls of that same assistant message are executed
        first, so none of them is dropped);
      * the assistant replies without tool calls (its text, or a fixed
        warning string when there is no text, is returned);
      * ``max_iterations`` is exhausted (a warning string is returned,
        including the most recent assistant text if there was any).

    Args:
        system_prompt_content: System message for the conversation.
        user_prompt_content: User message describing the chunk.
        tools_list: Tool schemas passed as ``tools`` to the API.
        tool_function_mapping: Tool name -> awaitable implementation.
        llm_model_name: Model name passed to the API.
        document_id_for_loop: Document id, used for logging.
        chunk_id_for_loop: Chunk id, used for logging.
        max_iterations: Maximum number of LLM calls for this chunk.

    Returns:
        A string describing how the loop ended (see above).
    """
    messages = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": user_prompt_content}
    ]
    print(f"----\nSYSTEM: {system_prompt_content[:300]}... (see cell 7 for full prompt)\nUSER: {user_prompt_content[:400]}...\n----")

    # Most recent non-empty assistant text seen in ANY iteration; reported if the
    # iteration budget runs out.
    last_assistant_text = None

    for i in range(max_iterations):
        print(f"\nIteration {i+1}/{max_iterations} for Doc: {document_id_for_loop}, Chunk: {chunk_id_for_loop}")

        response = await client.chat.completions.create(
            model=llm_model_name,
            messages=messages,
            tools=tools_list,
            tool_choice="auto",
        )

        message = response.choices[0].message
        messages.append(message.model_dump(exclude_none=True))

        # Text of THIS reply only (None when the reply has no text).
        assistant_message_content = message.content if message.content else None
        if assistant_message_content:
            last_assistant_text = assistant_message_content
            print(f"ASSISTANT (Textual Content): {assistant_message_content}")

        if not message.tool_calls:
            # No tool calls: the assistant considers itself done (or is stuck).
            if assistant_message_content:
                print(f"ASSISTANT (Final Response for chunk {chunk_id_for_loop}, no tool call): {assistant_message_content}")
                # A text-only reply without store_chunk_processing_info may indicate
                # an incomplete chunk; this is left to the prompt to prevent.
                return assistant_message_content
            print(f"Warning: No tool calls and no textual content from assistant in iteration {i+1} for chunk {chunk_id_for_loop}.")
            return "No tool calls and no textual content from assistant. Agent may be stuck or finished unexpectedly."

        print("ASSISTANT (Tool Calls):")
        all_tool_calls_processed = True
        store_success_result = None
        for tool_call in message.tool_calls:
            tool_name = tool_call.function.name
            tool_args_str = tool_call.function.arguments
            print(f"  - Calling tool: {tool_name}")
            print(f"    Arguments: {tool_args_str}")

            if tool_name not in tool_function_mapping:
                tool_result_content = f"Error: Tool '{tool_name}' not found."
                print(f"    Error: Tool '{tool_name}' not found.")
                all_tool_calls_processed = False
            else:
                try:
                    tool_args = json.loads(tool_args_str)
                    tool_function = tool_function_mapping[tool_name]
                    tool_result_content = await tool_function(**tool_args)
                except json.JSONDecodeError:
                    tool_result_content = f"Error: Invalid JSON in arguments for {tool_name}: {tool_args_str}"
                    print(f"    Error: Invalid JSON arguments for {tool_name}.")
                    all_tool_calls_processed = False
                except Exception as e:
                    # Tool failures are reported back to the model instead of aborting the loop.
                    tool_result_content = f"Error executing tool {tool_name}: {str(e)}"
                    print(f"    Error executing tool {tool_name}: {e}")
                    all_tool_calls_processed = False

            print(f"    Tool Result ({tool_name}): {str(tool_result_content)[:300]}...")
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": tool_name,
                "content": str(tool_result_content)
            })

            # Remember a successful store call, but keep executing the remaining
            # tool calls of this message so none of the requested edits is lost.
            if tool_name == "store_chunk_processing_info" and "Success" in str(tool_result_content):
                store_success_result = tool_result_content

        if not all_tool_calls_processed:
            print(f"Warning: One or more tool calls failed in iteration {i+1} for chunk {chunk_id_for_loop}.")
            # The loop continues; the agent sees the error in the message history.

        if store_success_result is not None:
            print(f"Agent called store_chunk_processing_info successfully. Ending iteration for chunk {chunk_id_for_loop}.")
            return f"store_chunk_processing_info called. Details: {store_success_result}"

    # Max iterations reached. A successful store call always returns from inside
    # the loop, so reaching this point means the chunk was not finalized.
    warning_message = (
        f"Max iterations ({max_iterations}) reached for Doc: {document_id_for_loop}, Chunk: {chunk_id_for_loop}. "
        "The agent may not have completed all its tasks for this chunk or "
        "failed to call 'store_chunk_processing_info' as the final step."
    )
    print(f"WARNING: {warning_message}")

    # Include the last piece of text from the assistant if there was any.
    if last_assistant_text:
        return f"{warning_message} Last assistant text: {last_assistant_text}"
    return warning_message

print("Agent loop function defined.")

Agent loop function defined.


In [76]:
# Cell 10: Main Workflow Orchestration (Revised)
async def process_document(
    document_text_content: str,
    doc_id: str,
    doc_source_filename: str,
    initial_doc_metadata: dict,
    chunk_s: int = 2500,
    chunk_o: int = 500,
    max_agent_iterations_per_chunk: int = 25  # Tunable: iterations for agent per chunk
) -> Optional[str]:
    """Convert one document to Akoma Ntoso XML by running the agent over each text chunk.

    The text is split with ``simple_chunk_text``. For every chunk the stored
    ``processing_state`` is re-read from MongoDB, a user prompt is built and
    ``run_agentic_loop`` is awaited. Afterwards the document record is read
    back, its ``akn_xml_string`` is written to ``<doc_id>_final_revised.akn.xml``
    (relative to the current working directory) and returned.

    Args:
        document_text_content: Full plain text of the document.
        doc_id: Identifier used for the MongoDB lookups, the prompt and the
            output filename.
        doc_source_filename: Currently unused by this function.
        initial_doc_metadata: Currently unused by this function.
        chunk_s: Chunk size passed to ``simple_chunk_text``.
        chunk_o: Overlap passed to ``simple_chunk_text``.
        max_agent_iterations_per_chunk: Iteration cap passed to
            ``run_agentic_loop`` for each chunk.

    Returns:
        The final XML string, or ``None`` when MongoDB cannot be reached or no
        XML string is stored for ``doc_id``.
    """
    print(f"Starting processing for document: {doc_id}")

    if not await mongo_manager.connect():  # connect returns True on success, False on failure
        print("Failed to connect to MongoDB. Aborting.")
        return None

    text_chunks = simple_chunk_text(document_text_content, chunk_size=chunk_s, overlap=chunk_o)
    print(f"Document divided into {len(text_chunks)} chunks.")

    # The agent is responsible for calling initialize_akn_document if needed;
    # the prompt guides it to do so for the first chunk.

    for i, chunk_data in enumerate(text_chunks):
        print(f"\n--- Processing Chunk {i+1}/{len(text_chunks)} (ID: {chunk_data['chunk_id']}) ---")
        print(f"Content (first 100 chars): {chunk_data['content'][:100]}...")

        # Re-read the processing state before every chunk so that whatever the
        # agent stored while handling the previous chunk is picked up.
        current_processing_state = {"last_processed_akn_eId": "act"}  # Default if no state found
        doc_db_state = await mongo_manager.documents_collection.find_one(
            {"document_id": doc_id}, {"processing_state": 1, "_id": 0}
        )
        stored_state = doc_db_state.get("processing_state") if doc_db_state else None
        # Only trust a dict: a null or malformed stored value falls back to the
        # default instead of raising AttributeError on .get() below.
        if isinstance(stored_state, dict):
            current_processing_state = stored_state
            last_eid = current_processing_state.get("last_processed_akn_eId", "act")
            print(f"Retrieved processing state from DB: Last eId '{last_eid}'")
        else:
            last_eid = current_processing_state.get("last_processed_akn_eId", "act")
            print(f"No prior processing state found for {doc_id} in DB, or document not initialized. Agent will need to initialize if it's chunk 1.")

        user_prompt = f"""
Analyze the following chunk of an Indonesian legal document (Chunk ID: {chunk_data['chunk_id']}).
The document ID is '{doc_id}'.
The previously processed Akoma Ntoso element eId (from the end of the last successfully processed chunk) was: '{last_eid}'.

Your task is to integrate this chunk's content into the Akoma Ntoso XML document.
Use tools to fetch context, add/update elements, and log your progress.
If this is the very first chunk processed for this document (i.e., no substantial AKN structure exists beyond a basic shell or it's chunk_001 and the document isn't fully initialized according to `get_akn_document_context`), you MUST call `initialize_akn_document` first, but only if it hasn't been successfully initialized before.
If `initialize_akn_document` has already been run (check context or if elements like 'act__body' exist), do NOT call it again.

Chunk Content:
---
{chunk_data['content']}
---
REMEMBER: You MUST call `store_chunk_processing_info` as the very last action for this chunk, providing an accurate `last_processed_akn_eId`.
Determine the correct AKN structure and eIds based on the system guidelines and the content. Be careful with parent eIds and insertion positions.
If a tool call fails, analyze the error and try to recover or adjust your strategy. Do not repeat the exact failing command without change.
"""

        agent_response_or_status = await run_agentic_loop(
            system_prompt_content=agent_system_prompt,  # From Cell 7
            user_prompt_content=user_prompt,
            tools_list=tools_schemas,  # From Cell 5
            tool_function_mapping=TOOL_MAPPING,  # From Cell 6
            llm_model_name=LLM_MODEL,  # From Cell 3
            document_id_for_loop=doc_id,
            chunk_id_for_loop=chunk_data['chunk_id'],
            max_iterations=max_agent_iterations_per_chunk
        )
        print(f"Agent interaction for chunk {chunk_data['chunk_id']} ended. Status/Response: {str(agent_response_or_status)[:300]}...")

        # State is re-fetched at the start of the next loop iteration, so a
        # chunk whose state was not stored leaves the previous state in effect.

    print(f"\n--- Document processing for {doc_id} complete (all chunks initiated). ---")

    final_doc_data = await mongo_manager.documents_collection.find_one({"document_id": doc_id})
    akn_xml = final_doc_data.get("akn_xml_string") if final_doc_data else None
    # A missing record, a missing key, or a non-string value (e.g. null) is
    # reported as an error rather than crashing on slicing / file write.
    if not isinstance(akn_xml, str):
        print(f"Error: Could not retrieve final Akoma Ntoso XML for {doc_id}.")
        return None

    print("\nFinal Akoma Ntoso XML (first 1000 chars):")
    print(akn_xml[:1000])
    output_filename = f"{doc_id}_final_revised.akn.xml"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(akn_xml)
    print(f"Full Akoma Ntoso XML saved to: {output_filename}")
    return akn_xml

print("Main workflow orchestration function defined.")

Main workflow orchestration function defined.


In [77]:
# Cell 11: Load example and run processing (Revised)

# Run configuration for the sample document -- edit these values for your own file.
sample_doc_path = "../../raw/UU_8_1961.txt"  # relative path, resolved against the current working directory
document_id_main = "UU_8_1961_vAkn_rev3"  # identifier for this processing run

# Descriptive metadata passed to process_document() as initial_doc_metadata.
initial_metadata_main = dict(
    title="WAJIB KERJA SARJANA",
    number="8",
    year="1961",
    doc_subtype="UNDANG-UNDANG",
    country_code="id",
)

async def main():
    """Load the sample document, run ``process_document`` on it and report the outcome.

    Reads ``sample_doc_path``, connects to MongoDB through ``mongo_manager``,
    processes the text under ``document_id_main`` and prints a short summary.
    Returns ``None`` in every case; a missing file or a failed connection is
    reported with a message and ends the run early.
    """
    try:
        with open(sample_doc_path, 'r', encoding='utf-8') as f:
            raw_document_text = f.read()
    except FileNotFoundError:
        print(f"Error: Document file not found at {sample_doc_path}. Please check the path.")
        return
    else:
        print(f"Successfully loaded document: {sample_doc_path}, length: {len(raw_document_text)} chars.")

    if not await mongo_manager.connect():
        print("MongoDB connection failed. Cannot proceed.")
        return

    # Clean up existing document entry for a fresh run (optional, for testing)
    # print(f"Attempting to clean up existing entry for {document_id_main}...")
    # delete_result = await mongo_manager.documents_collection.delete_one({"document_id": document_id_main})
    # if delete_result.deleted_count > 0:
    #     print(f"Cleaned up existing entry for {document_id_main}.")
    # else:
    #     print(f"No existing entry for {document_id_main} to clean up, or error in deletion.")

    try:
        final_akn_xml = await process_document(
            document_text_content=raw_document_text,
            doc_id=document_id_main,
            doc_source_filename=os.path.basename(sample_doc_path),
            initial_doc_metadata=initial_metadata_main,
            chunk_s=2500,
            chunk_o=400,
            max_agent_iterations_per_chunk=30  # Increased from default 25, adjust as needed
        )
    finally:
        # Close the connection even when processing raises, so a failed run
        # does not leave the MongoDB client open in the kernel.
        await mongo_manager.close()

    if final_akn_xml:
        print(f"\nProcessing complete for {document_id_main}.\nFinal XML length: {len(final_akn_xml)}")
    else:
        print(f"\nProcessing for {document_id_main} encountered issues or did not produce XML.")

print("To run the processing, execute 'await main()' in a new cell.")
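# Inside Jupyter the kernel already runs an event loop, so run it with `await main()` in a cell.
# `asyncio.run(main())` raises RuntimeError under a running loop; use it only from a standalone script:
# import asyncio
# asyncio.run(main())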

To run the processing, execute 'await main()' in a new cell.


In [78]:
await main()

Successfully loaded document: ../../raw/UU_8_1961.txt, length: 11846 chars.
Attempting to connect to MongoDB: hukum_terbuka_akn...
✅ Connected to MongoDB: hukum_terbuka_akn, collection: legal_documents_akn
Starting processing for document: UU_8_1961_vAkn_rev3
✅ MongoDB connection already active for: hukum_terbuka_akn
Document divided into 6 chunks.

--- Processing Chunk 1/6 (ID: chunk_001) ---
Content (first 100 chars): PRESIDEN
REPUBLIK INDONESIA
UNDANG-UNDANG REPUBLIK INDONESIA
NOMOR 8 TAHUN 1961
TENTANG
WAJIB KERJA ...
No prior processing state found for UU_8_1961_vAkn_rev3 in DB, or document not initialized. Agent will need to initialize if it's chunk 1.
----
SYSTEM: 
You are an expert legislative informatics assistant specializing in converting Indonesian legal documents into Akoma Ntoso (AKN) v3.0 XML format.
Your primary goal is to analyze chunks of an Indonesian legal text and incrementally construct a well-formed and semantically rich Akoma Ntoso XML docume... (see cell 7 for 