# main

## imports

In [334]:
# Basic imports
import json
import os
from datetime import datetime
import uuid
from typing import List, Dict, Any, Optional

# OpenAI client
from openai import AsyncOpenAI

# Weave for tracing
import weave

# MongoDB
import pymongo
from motor.motor_asyncio import AsyncIOMotorClient

# Pydantic models (reuse your existing models)
from pydantic import BaseModel, Field


In [335]:
# Initialize Weave Tracing
# Starts Weave under the 'HukumTerbuka' project name. This runs before the
# client below is constructed; NOTE(review): presumably so the client's calls
# are traced — confirm against the Weave docs before reordering.
weave.init('HukumTerbuka')

# Set up the OpenAI client
# Requests are sent to the OpenRouter endpoint instead of the SDK's default base URL.
# The key comes from OPENROUTER_API_KEY, falling back to OPENAI_API_KEY when the
# first is unset or empty (os.getenv returns None / "" and `or` moves on).
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY") or os.getenv("OPENAI_API_KEY")
)

## chunking

In [336]:
def simple_chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> List[Dict[str, Any]]:
    """
    Split text into fixed-size character windows that overlap their neighbour.

    Each window starts ``chunk_size - overlap`` characters after the previous
    one, so the last ``overlap`` characters of a chunk are repeated at the start
    of the next. The whole text is always covered; there is no cap on the
    number of chunks.

    Args:
        text: The full document text.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of dicts with keys ``chunk_id`` ("chunk_001", ...), ``start_pos``,
        ``end_pos`` (exclusive, clamped to the text length), ``content`` and
        ``char_count``. Empty text yields an empty list.

    Raises:
        ValueError: If chunk_size or overlap would stop the window from advancing
            or would skip text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        # overlap >= chunk_size would never move the window forward;
        # a negative overlap would silently skip characters between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text_length = len(text)
    stride = chunk_size - overlap
    chunks: List[Dict[str, Any]] = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)
        piece = text[start:end]

        chunks.append({
            "chunk_id": f"chunk_{len(chunks) + 1:03d}",
            "start_pos": start,
            "end_pos": end,
            "content": piece,
            "char_count": len(piece)
        })

        # The window that reaches the end of the text is the last one.
        if end == text_length:
            break
        start += stride

    return chunks

# Chunk Sample Doc
# Read the sample law text from disk, preview it, then split it into chunks.
sample_doc_path = "../../raw/UU_8_1961.txt"

try:
    with open(sample_doc_path, 'r', encoding='utf-8') as f:
        document_text = f.read()
except FileNotFoundError:
    # Nothing was loaded, so document_text / chunks are not (re)assigned here.
    print(f"File not found: {sample_doc_path}")
    print("Please check the file path")
else:
    # Short preview so the reader can confirm the right file was picked up.
    print(f"Loaded document: {len(document_text)} characters")
    print(f"First 500 characters:")
    print(document_text[:500])
    print("...")

    # Chunk the document
    chunks = simple_chunk_text(document_text, chunk_size=2000, overlap=200)
    print(f"\nCreated {len(chunks)} chunks")

Loaded document: 11846 characters
First 500 characters:
PRESIDEN
REPUBLIK INDONESIA
UNDANG-UNDANG REPUBLIK INDONESIA
NOMOR 8 TAHUN 1961
TENTANG
WAJIB KERJA SARJANA
PRESIDEN REPUBLIK INDONESIA,
Menimbang:a.bahwa ilmu dan keahlian azasnya untuk mengabdi kepada tanah
air, karenanya perlu dikembangkan dan dilaksanakan.
b.bahwa  dalam  rangka  pembangunan  nasional  semesta  berencana
sangat diperlukan tenaga sarjana dari perbagai jurusan;
c.bahwa agar penempatan dan penggunaan tenaga sarjana tersebut
teratur  dan  merata  maka  perlu  diadakan  peraturan
...

Created 7 chunks


## schemas

In [337]:
# Basic document structure models
# Identifying metadata for one legal document. Only document_type is required;
# every other field is an optional string that defaults to None.
# (Documented with comments rather than a class docstring so the generated
# JSON schema of the model is left unchanged.)
class DocumentMeta(BaseModel):
    document_type: str  # UU, PP, Perpres, etc.
    number: Optional[str] = None  # stored as text, not as an int
    year: Optional[str] = None  # stored as text, not as an int
    title: Optional[str] = None
    authority: Optional[str] = None

# One structural element of a document. Elements nest through `subsections`,
# so a single value can describe a whole tree.
class DocumentSection(BaseModel):
    tag: str  # pasal, bab, ayat, konsideran, etc.
    number: Optional[str] = None
    title: Optional[str] = None
    content: str
    # The self-reference is written as a string because the class name is not
    # bound yet while its own body is being evaluated.
    subsections: List['DocumentSection'] = []

# A whole legal document: its metadata plus four lists of sections. The four
# list names are the same keys create_document_in_db writes under "structure".
class LegalDocument(BaseModel):
    meta: DocumentMeta
    preface: List[DocumentSection] = []
    preamble: List[DocumentSection] = []
    body: List[DocumentSection] = []
    conclusions: List[DocumentSection] = []

# Fix the forward reference
# Rebuilds the model schema now that the name 'DocumentSection' (used as a
# string annotation inside the class) can be resolved.
DocumentSection.model_rebuild()

In [338]:
# Pydantic models for MongoDB operations
# Metadata payload validated by create_document_in_db before it is stored, and
# the type of AnalysisResult.document_meta_found. Field-for-field identical to
# DocumentMeta defined earlier in this notebook.
class DocumentMetaInput(BaseModel):
    document_type: str  # the only required field
    number: Optional[str] = None
    year: Optional[str] = None
    title: Optional[str] = None
    authority: Optional[str] = None

# Section payload validated by update_document_section / add_document_section
# and used for AnalysisResult.identified_sections. Same fields as
# DocumentSection, but `subsections` uses a default_factory instead of a
# literal [] default.
class SectionItem(BaseModel):
    tag: str
    number: Optional[str] = None
    title: Optional[str] = None
    content: str
    # Recursive: each subsection is itself a SectionItem (string annotation
    # because the class name is not bound yet inside its own body).
    subsections: List['SectionItem'] = Field(default_factory=list)

# Fix the forward reference
# Rebuilds the model schema now that the name 'SectionItem' can be resolved.
SectionItem.model_rebuild()

# Description of one text chunk, validated by store_chunk_analysis. The field
# names match the keys of the dicts produced by simple_chunk_text.
class ChunkData(BaseModel):
    chunk_id: str  # e.g. "chunk_001"
    start_pos: int
    end_pos: int
    content: str
    char_count: Optional[int] = None  # may be omitted or null

# Result of analysing one chunk. store_chunk_analysis validates its `analysis`
# argument with this model, and the `response_format` schema in the main-agent
# section is generated from it via model_json_schema().
class AnalysisResult(BaseModel):
    identified_sections: List[SectionItem]
    document_meta_found: Optional[DocumentMetaInput] = None  # None when the chunk carries no metadata
    confidence: float  # NOTE(review): the tool schema describes this as 0.0 to 1.0; the range is not enforced here
    notes: str = ""

## tools

### mongodb

In [339]:
import pymongo
from motor.motor_asyncio import AsyncIOMotorClient
import datetime
import uuid

# MongoDB connection setup
class MongoDBManager:
    """Keeps the Motor client and the selected database handle in one place."""

    def __init__(self, connection_string: str = "mongodb://localhost:27017/", db_name: str = "hukum_terbuka"):
        # Connection settings are only stored here; nothing is opened yet.
        self.connection_string = connection_string
        self.db_name = db_name
        # Both handles stay None until connect() has been awaited.
        self.client = None
        self.db = None

    async def connect(self):
        """Create the client, select the database and ping the server.

        Returns True when the ping succeeds and False (after printing the
        error) when it raises.
        """
        motor_client = AsyncIOMotorClient(self.connection_string)
        self.client = motor_client
        self.db = motor_client[self.db_name]

        # A ping round-trip is what actually verifies the server is reachable.
        try:
            await motor_client.admin.command('ping')
        except Exception as error:
            print(f"❌ MongoDB connection failed: {error}")
            return False

        print(f"✅ Connected to MongoDB: {self.db_name}")
        return True

    async def close(self):
        """Close the client if one was created; otherwise do nothing."""
        if not self.client:
            return
        self.client.close()

# Initialize MongoDB manager
# Module-level instance that the tool functions read `.db` from. It is built
# with the default localhost URI and the "hukum_terbuka" database; no
# connection exists until `await mongo_manager.connect()` has been run.
mongo_manager = MongoDBManager()

In [340]:
# MongoDB document schema - matches our Pydantic models but with MongoDB-specific fields
class MongoDocument:
    """
    Documentation-only placeholder describing how a legal document is stored
    in MongoDB. The class has no fields or behavior; the tool functions build
    and update plain dicts of this shape.

    Structure:
    {
        "_id": ObjectId,
        "document_id": "unique_doc_id",
        "meta": {
            "document_type": "UU",
            "number": "8",
            "year": "1961",
            "title": "...",
            "authority": "...",
            "source_file": "...",
            "created_at": datetime,
            "updated_at": datetime
        },
        "structure": {
            "preface": [...],
            "preamble": [...],
            "body": [...],
            "conclusions": [...]
        },
        "processing_status": {
            "total_chunks": 10,
            "processed_chunks": 5,
            "status": "created|processing|completed|failed",
            "last_updated": datetime
        },
        "chunks": [
            {
                "chunk_id": "chunk_001",
                "start_pos": 0,
                "end_pos": 2000,
                "content": "...",
                "char_count": 2000,
                "analysis": {...},
                "processed_at": datetime
            }
        ]
    }

    Notes:
    - New documents start with status "created", empty structure lists and
      an empty "chunks" list (see create_document_in_db).
    - Each "structure" list holds dumped SectionItem dicts; each chunk's
      "analysis" is a dumped AnalysisResult.
    """
    pass

# Collection names
# Maps a logical name to the MongoDB collection name. The tool functions in
# this notebook only read COLLECTIONS["documents"]; chunks are embedded in the
# document itself. NOTE(review): "chunks" and "processing_logs" are not
# referenced in the code shown here — confirm whether they are still needed.
COLLECTIONS = {
    "documents": "legal_documents",
    "chunks": "document_chunks",
    "processing_logs": "processing_logs"
}


In [341]:
# Function-calling schemas advertised to the model, one entry per tool in
# TOOL_MAPPING. Every schema is declared strict: each object lists all of its
# properties in "required" and sets "additionalProperties" to False; optional
# values are expressed as nullable types instead of being left out.
# No property carries a "default" keyword: a required property can never fall
# back to a default, and strict function schemas reject that keyword.
# Recursive SectionItem structures point back at their own schema via "$ref".
tools = [
    {
        "type": "function",
        "function": {
            "name": "create_document_in_db",
            "description": "Create a new legal document in MongoDB. All metadata fields should be provided; use null for optional text fields if not applicable.",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {
                        "type": "string",
                        "description": "Unique identifier for the document."
                    },
                    "meta": {
                        "type": "object",
                        "description": "Metadata for the legal document. All fields (document_type, number, year, title, authority) must be specified; use null for number, year, title, or authority if they are not applicable or unknown.",
                        "properties": {
                            "document_type": {"type": "string", "description": "Type of the document (e.g., UU, PP)."},
                            "number": {"type": ["string", "null"], "description": "Document number. Null if not applicable."},
                            "year": {"type": ["string", "null"], "description": "Year of the document. Null if not applicable."},
                            "title": {"type": ["string", "null"], "description": "Official title of the document. Null if not applicable."},
                            "authority": {"type": ["string", "null"], "description": "Issuing authority. Null if not applicable."}
                        },
                        "required": ["document_type", "number", "year", "title", "authority"],
                        "additionalProperties": False
                    },
                    "source_file": {
                        "type": "string",
                        "description": "Path to the source file of the document."
                    }
                },
                "required": ["document_id", "meta", "source_file"],
                "additionalProperties": False
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "get_document_from_db",
            "description": "Retrieve a legal document from MongoDB using its unique document_id.",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "The unique identifier of the document to retrieve."}
                },
                "required": ["document_id"],
                "additionalProperties": False
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "get_context_for_chunk",
            "description": "Fetch context from previously processed chunks for a given document. Includes previous chunk analyses, overall document structure, and metadata.",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "current_chunk_id": {"type": "string", "description": "ID of the current chunk being processed (e.g., chunk_001)."},
                    "context_window_size": {
                        "type": "integer",
                        "description": "Number of previous chunks to include in the context. Pass 1 for the usual single-chunk window."
                    }
                },
                "required": ["document_id", "current_chunk_id", "context_window_size"],
                "additionalProperties": False
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "update_document_section",
            "description": "Update/replace an entire specific section (preface, preamble, body, or conclusions) of a legal document. The provided section_data will overwrite the existing content of that section. Each item in section_data must be a complete SectionItem object.",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "section_type": {
                        "type": "string",
                        "enum": ["preface", "preamble", "body", "conclusions"],
                        "description": "The type of section to update."
                    },
                    "section_data": {
                        "type": "array",
                        "description": "A list of SectionItem objects that will form the new content for the specified section. Each item must be a complete SectionItem, including all its fields (tag, number, title, content, subsections). Use null for optional text fields and empty list for subsections if not applicable.",
                        "items": {
                            "type": "object",
                            "properties": {
                                "tag": {"type": "string", "description": "Tag of the section item (e.g., pasal, ayat)."},
                                "number": {"type": ["string", "null"], "description": "Number of the section item. Null if not applicable."},
                                "title": {"type": ["string", "null"], "description": "Title of the section item. Null if not applicable."},
                                "content": {"type": "string", "description": "Content of the section item."},
                                "subsections": {
                                    "type": "array",
                                    "description": "Nested subsections. Each must also be a complete SectionItem object. Provide an empty list if no subsections.",
                                    "items": {"$ref": "#/properties/section_data/items"}
                                }
                            },
                            "required": ["tag", "number", "title", "content", "subsections"],
                            "additionalProperties": False
                        }
                    }
                },
                "required": ["document_id", "section_type", "section_data"],
                "additionalProperties": False
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "add_document_section",
            "description": "Add a new item (as a SectionItem object) to a specific document section (preface, preamble, body, or conclusions). The new item is appended to the existing items in that section.",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "section_type": {
                        "type": "string",
                        "enum": ["preface", "preamble", "body", "conclusions"],
                        "description": "The type of section to add the new item to."
                    },
                    "section_item": {
                        "type": "object",
                        "description": "The SectionItem object to add. It must be a complete SectionItem, including all its fields (tag, number, title, content, subsections). Use null for optional text fields and empty list for subsections if not applicable.",
                        "properties": {
                            "tag": {"type": "string", "description": "Tag of the new section item (e.g., pasal, ayat)."},
                            "number": {"type": ["string", "null"], "description": "Number of the new section item. Null if not applicable."},
                            "title": {"type": ["string", "null"], "description": "Title of the new section item. Null if not applicable."},
                            "content": {"type": "string", "description": "Content of the new section item."},
                            "subsections": {
                                "type": "array",
                                "description": "Nested subsections for the new item. Each must also be a complete SectionItem object. Provide an empty list if no subsections.",
                                "items": {"$ref": "#/properties/section_item"}
                            }
                        },
                        "required": ["tag", "number", "title", "content", "subsections"],
                        "additionalProperties": False
                    }
                },
                "required": ["document_id", "section_type", "section_item"],
                "additionalProperties": False
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "store_chunk_analysis",
            "description": "Store the analysis results for a specific chunk of a document. This includes identified sections, found document metadata, confidence score, and notes. All fields within chunk_data and analysis (and their sub-objects like SectionItem or DocumentMetaInput) must be provided; use null or empty strings/lists where appropriate for optional/empty values.",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "chunk_data": {
                        "type": "object",
                        "description": "Information about the processed chunk. All fields must be provided; use null for char_count if not applicable.",
                        "properties": {
                            "chunk_id": {"type": "string", "description": "ID of the chunk."},
                            "start_pos": {"type": "integer", "description": "Start position of the chunk in the original document."},
                            "end_pos": {"type": "integer", "description": "End position of the chunk in the original document."},
                            "content": {"type": "string", "description": "The text content of the chunk."},
                            "char_count": {"type": ["integer", "null"], "description": "Character count of the chunk. Null if not applicable."}
                        },
                        "required": ["chunk_id", "start_pos", "end_pos", "content", "char_count"],
                        "additionalProperties": False
                    },
                    "analysis": {
                        "type": "object",
                        "description": "The analysis result for the chunk. All fields must be provided. For 'document_meta_found', provide the metadata object or null. For 'identified_sections', each item must be a complete SectionItem. 'notes' can be an empty string.",
                        "properties": {
                            "identified_sections": {
                                "type": "array",
                                "description": "List of sections identified in the chunk. Each item must be a complete SectionItem object.",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "tag": {"type": "string", "description": "Tag of the identified section (e.g., pasal, ayat)."},
                                        "number": {"type": ["string", "null"], "description": "Number of the section. Null if not applicable."},
                                        "title": {"type": ["string", "null"], "description": "Title of the section. Null if not applicable."},
                                        "content": {"type": "string", "description": "Content of the section."},
                                        "subsections": {
                                            "type": "array",
                                            "description": "Nested subsections. Each must also be a complete SectionItem object. Provide an empty list if no subsections.",
                                            "items": {"$ref": "#/properties/analysis/properties/identified_sections/items"}
                                        }
                                    },
                                    "required": ["tag", "number", "title", "content", "subsections"],
                                    "additionalProperties": False
                                }
                            },
                            "document_meta_found": {
                                "type": ["object", "null"],
                                "description": "Document metadata found in the chunk, or null if none found. If an object, all its fields must be provided (use null for optional text fields).",
                                "properties": {
                                    "document_type": {"type": "string", "description": "Type of the document."},
                                    "number": {"type": ["string", "null"], "description": "Document number. Null if not applicable."},
                                    "year": {"type": ["string", "null"], "description": "Year of the document. Null if not applicable."},
                                    "title": {"type": ["string", "null"], "description": "Title of the document. Null if not applicable."},
                                    "authority": {"type": ["string", "null"], "description": "Issuing authority. Null if not applicable."}
                                },
                                "required": ["document_type", "number", "year", "title", "authority"], # Required if object
                                "additionalProperties": False
                            },
                            "confidence": {"type": "number", "description": "Confidence score of the analysis (0.0 to 1.0)."},
                            "notes": {"type": "string", "description": "Any notes about the analysis. Can be an empty string."}
                        },
                        "required": ["identified_sections", "document_meta_found", "confidence", "notes"],
                        "additionalProperties": False
                    }
                },
                "required": ["document_id", "chunk_data", "analysis"],
                "additionalProperties": False
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "update_processing_status",
            "description": "Update the processing status of a document (e.g., created, processing, completed, failed).",
            "strict": True,
            "parameters": {
                "type": "object",
                "properties": {
                    "document_id": {"type": "string", "description": "Unique identifier for the document."},
                    "status": {
                        "type": "string",
                        "enum": ["created", "processing", "completed", "failed"],
                        "description": "The new processing status."
                    },
                    "total_chunks": {
                        "type": ["integer", "null"],
                        "description": "Total number of chunks for the document. Provide null if not updating or not known."
                    }
                },
                "required": ["document_id", "status", "total_chunks"],
                "additionalProperties": False
            }
        }
    }
]


In [342]:
def _late_bound_tool(tool_name: str):
    """Return a proxy that looks `tool_name` up in the notebook globals when called.

    The tool functions are defined in a later cell than this mapping, so
    referencing them directly here raises NameError on Restart & Run All.
    Deferring the lookup to call time makes the mapping independent of cell
    order; it also picks up a tool function that is redefined later.
    """
    def _invoke(*args, **kwargs):
        # For the async tools this returns the coroutine, so callers can still
        # write `await TOOL_MAPPING[name](**arguments)`.
        return globals()[tool_name](*args, **kwargs)

    _invoke.__name__ = tool_name
    return _invoke


# Dispatch table: tool name as advertised in `tools` -> callable implementing it.
TOOL_MAPPING = {
    tool_name: _late_bound_tool(tool_name)
    for tool_name in (
        "create_document_in_db",
        "get_document_from_db",
        "get_context_for_chunk",
        "update_document_section",
        "add_document_section",
        "store_chunk_analysis",
        "update_processing_status",
    )
}

In [None]:
# Function tools with proper Pydantic models
async def create_document_in_db(document_id: str, meta: dict, source_file: str) -> str:
    """
    Create a new legal document in MongoDB

    The record starts with empty structure lists, no chunks and status
    "created". A document_id that is already present is not inserted again,
    because every other tool addresses documents by document_id through
    find_one/update_one and would only ever reach one of the duplicates.

    Args:
        document_id: Unique identifier for the document
        meta: Document metadata (type, number, year, title, authority)
        source_file: Path to source file

    Returns:
        A human-readable status string: the inserted id on success, an
        "already exists" message for a duplicate document_id, or an
        "Error creating document: ..." message if anything raised.
    """
    try:
        # Convert dict to Pydantic model (validates the metadata fields)
        meta_dict = DocumentMetaInput(**meta).model_dump()

        collection = mongo_manager.db[COLLECTIONS["documents"]]

        # Check-then-insert is not atomic; it protects against an agent
        # repeating the call, not against concurrent writers.
        existing = await collection.find_one({"document_id": document_id}, {"_id": 1})
        if existing is not None:
            return f"Document already exists: {document_id}"

        # One timestamp for the whole record so created_at, updated_at and
        # last_updated are exactly equal on a fresh document.
        # (datetime.timezone.utc is the same object as datetime.UTC.)
        now = datetime.datetime.now(datetime.timezone.utc)

        document = {
            "document_id": document_id,
            "meta": {
                **meta_dict,
                "source_file": source_file,
                "created_at": now,
                "updated_at": now
            },
            "structure": {
                "preface": [],
                "preamble": [],
                "body": [],
                "conclusions": []
            },
            "processing_status": {
                "total_chunks": 0,
                "processed_chunks": 0,
                "status": "created",
                "last_updated": now
            },
            "chunks": []
        }

        result = await collection.insert_one(document)
        return f"Document created with ID: {result.inserted_id}"

    except Exception as e:
        # Errors are reported as text because the result is fed back to the model.
        return f"Error creating document: {str(e)}"

async def get_document_from_db(document_id: str) -> str:
    """
    Look up one stored legal document and return it as pretty-printed JSON

    Args:
        document_id: Unique identifier for the document

    Returns:
        The document as an indented JSON string, a "Document not found"
        message, or an "Error retrieving document: ..." message.
    """
    try:
        collection = mongo_manager.db[COLLECTIONS["documents"]]
        record = await collection.find_one({"document_id": document_id})

        if not record:
            return f"Document not found: {document_id}"

        # ObjectId is not JSON serializable, so it is stringified up front;
        # default=str covers remaining non-JSON values such as datetimes.
        record["_id"] = str(record["_id"])
        return json.dumps(record, default=str, ensure_ascii=False, indent=2)

    except Exception as e:
        return f"Error retrieving document: {str(e)}"

async def update_document_section(document_id: str, section_type: str, section_data: list) -> str:
    """
    Update a specific section of a legal document (PUT operation)
    
    Args:
        document_id: Unique identifier for the document
        section_type: Type of section (preface, preamble, body, conclusions)
        section_data: Section data to update
    """
    try:
        valid_sections = ["preface", "preamble", "body", "conclusions"]
        if section_type not in valid_sections:
            return f"Invalid section type. Must be one of: {valid_sections}"
        
        # Convert list of dicts to Pydantic models
        section_models = [SectionItem(**item) for item in section_data]
        section_data_dict = [item.model_dump() for item in section_models]
        
        update_data = {
            f"structure.{section_type}": section_data_dict,
            "meta.updated_at": datetime.datetime.now(datetime.UTC)
        }
        
        result = await mongo_manager.db[COLLECTIONS["documents"]].update_one(
            {"document_id": document_id},
            {"$set": update_data}
        )
        
        if result.matched_count > 0:
            return f"Updated {section_type} section for document {document_id}"
        else:
            return f"Document not found: {document_id}"
            
    except Exception as e:
        return f"Error updating document section: {str(e)}"

async def add_document_section(document_id: str, section_type: str, section_item: dict) -> str:
    """
    Add a new item to a document section (POST operation)
    
    Args:
        document_id: Unique identifier for the document
        section_type: Type of section (preface, preamble, body, conclusions)
        section_item: New section item to add
    """
    try:
        valid_sections = ["preface", "preamble", "body", "conclusions"]
        if section_type not in valid_sections:
            return f"Invalid section type. Must be one of: {valid_sections}"
        
        # Convert dict to Pydantic model
        section_item_model = SectionItem(**section_item)
        section_item_dict = section_item_model.model_dump()
        
        result = await mongo_manager.db[COLLECTIONS["documents"]].update_one(
            {"document_id": document_id},
            {
                "$push": {f"structure.{section_type}": section_item_dict}, # Push only the section item
                "$set": {"meta.updated_at": datetime.datetime.now(datetime.UTC)}  # Set the timestamp
            }
        )
        
        if result.matched_count > 0:
            return f"Added new item to {section_type} section for document {document_id}"
        else:
            return f"Document not found: {document_id}"
            
    except Exception as e:
        return f"Error adding to document section: {str(e)}"

async def store_chunk_analysis(document_id: str, chunk_data: dict, analysis: dict) -> str:
    """
    Store chunk analysis results in the document (PATCH operation)

    Storing is idempotent per chunk_id: if the document already holds an entry
    for this chunk, that entry is replaced in place; otherwise the chunk is
    appended and processed_chunks is incremented. Repeating the call therefore
    neither duplicates the chunk nor inflates the counter.

    Args:
        document_id: Unique identifier for the document
        chunk_data: Chunk information (chunk_id, start_pos, end_pos, content)
        analysis: Analysis results from the agent

    Returns:
        A status message: success, document not found, or
        "Error storing chunk analysis: ...".
    """
    try:
        # Convert dicts to Pydantic models (validates both payloads)
        chunk_data_model = ChunkData(**chunk_data)
        analysis_model = AnalysisResult(**analysis)

        # One timestamp so processed_at, last_updated and updated_at agree.
        # (datetime.timezone.utc is the same object as datetime.UTC.)
        now = datetime.datetime.now(datetime.timezone.utc)

        chunk_with_analysis = {
            **chunk_data_model.model_dump(),
            "analysis": analysis_model.model_dump(),
            "processed_at": now
        }
        touched = {
            "processing_status.last_updated": now,
            "meta.updated_at": now
        }

        collection = mongo_manager.db[COLLECTIONS["documents"]]

        # First try to overwrite an existing entry for this chunk_id; the
        # positional "$" refers to the array element matched by the filter.
        result = await collection.update_one(
            {"document_id": document_id, "chunks.chunk_id": chunk_data_model.chunk_id},
            {"$set": {"chunks.$": chunk_with_analysis, **touched}}
        )

        if result.matched_count == 0:
            # No entry yet (or no such document): append and count the chunk.
            result = await collection.update_one(
                {"document_id": document_id},
                {
                    "$push": {"chunks": chunk_with_analysis},
                    "$inc": {"processing_status.processed_chunks": 1},
                    "$set": touched
                }
            )

        if result.matched_count > 0:
            return f"Stored analysis for chunk {chunk_data_model.chunk_id} in document {document_id}"
        else:
            return f"Document not found: {document_id}"

    except Exception as e:
        return f"Error storing chunk analysis: {str(e)}"

async def update_processing_status(document_id: str, status: str, total_chunks: Optional[int] = None) -> str:
    """
    Update document processing status

    Args:
        document_id: Unique identifier for the document
        status: New status (created, processing, completed, failed)
        total_chunks: Total number of chunks (optional); left untouched in the
            database when None

    Returns:
        A status message: success, invalid status, document not found, or
        "Error updating processing status: ...".
    """
    try:
        # Same guard style as the section tools: reject values outside the
        # set the tool schema advertises before touching the database.
        valid_statuses = ["created", "processing", "completed", "failed"]
        if status not in valid_statuses:
            return f"Invalid status. Must be one of: {valid_statuses}"

        # One timestamp so both "last touched" fields agree.
        # (datetime.timezone.utc is the same object as datetime.UTC.)
        now = datetime.datetime.now(datetime.timezone.utc)

        update_data = {
            "processing_status.status": status,
            "processing_status.last_updated": now,
            "meta.updated_at": now
        }

        if total_chunks is not None:
            update_data["processing_status.total_chunks"] = total_chunks

        result = await mongo_manager.db[COLLECTIONS["documents"]].update_one(
            {"document_id": document_id},
            {"$set": update_data}
        )

        if result.matched_count > 0:
            return f"Updated processing status to '{status}' for document {document_id}"
        else:
            return f"Document not found: {document_id}"

    except Exception as e:
        return f"Error updating processing status: {str(e)}"
    
def _chunk_number(chunk_id: Any) -> Optional[int]:
    """Return the numeric suffix of an ID such as 'chunk_007', or None if it has none."""
    try:
        return int(chunk_id.split('_')[-1])
    except (AttributeError, ValueError):
        # AttributeError: not a string; ValueError: suffix is not a number.
        return None


async def get_context_for_chunk(document_id: str, current_chunk_id: str, context_window_size: int = 1) -> str:
    """
    Fetch context from previously processed chunks

    Selects the stored chunks whose number lies in
    [current - context_window_size, current), ordered by chunk number, and
    returns them together with the document structure and metadata.
    Stored chunks without a usable chunk_id are skipped.

    Args:
        document_id: Unique identifier for the document
        current_chunk_id: ID of the current chunk being processed
        context_window_size: Number of previous chunks to include in context

    Returns:
        A JSON string with keys "previous_chunks", "document_structure" and
        "metadata", or a plain message ("Document not found: ...",
        "Invalid chunk ID format", "Error fetching context: ...").
    """
    try:
        # Get the document
        document = await mongo_manager.db[COLLECTIONS["documents"]].find_one(
            {"document_id": document_id}
        )

        if not document:
            return f"Document not found: {document_id}"

        # Extract current chunk number
        current_chunk_num = _chunk_number(current_chunk_id)
        if current_chunk_num is None:
            return "Invalid chunk ID format"

        # Find previous chunks inside the window, remembering each parsed
        # number so it does not have to be parsed again for sorting.
        first_wanted = current_chunk_num - context_window_size
        numbered_chunks = []
        for chunk in document.get("chunks", []):
            try:
                stored_id = chunk["chunk_id"]
            except (KeyError, TypeError):
                # Entry is not a dict or carries no chunk_id: skip it.
                continue
            chunk_num = _chunk_number(stored_id)
            if chunk_num is not None and first_wanted <= chunk_num < current_chunk_num:
                numbered_chunks.append((chunk_num, chunk))

        # Sort by chunk number (stable, so equal numbers keep stored order)
        numbered_chunks.sort(key=lambda pair: pair[0])
        previous_chunks = [chunk for _, chunk in numbered_chunks]

        # Format context information
        context = {
            "previous_chunks": previous_chunks,
            "document_structure": document.get("structure", {}),
            "metadata": document.get("meta", {})
        }

        # default=str covers values json cannot encode itself, e.g. datetimes.
        return json.dumps(context, default=str, ensure_ascii=False, indent=2)

    except Exception as e:
        return f"Error fetching context: {str(e)}"

## main agent

In [350]:
class ChunkAnalysis(BaseModel):
    # Pydantic model describing the analysis result for a single chunk.
    # Documented with `#` comments on purpose: a class docstring would be
    # emitted as the `description` of the generated JSON schema.

    # Identifier of the analysed chunk (presumably the "chunk_NNN" IDs produced
    # by the chunking step — TODO confirm).
    chunk_id: str
    # Sections recognised in this chunk; DocumentSection is defined elsewhere in the notebook.
    identified_sections: List[DocumentSection]
    # Document-level metadata when the chunk contains any; defaults to None.
    document_meta_found: Optional[DocumentMeta] = None
    confidence: float  # 0.0 to 1.0 (documented range only; nothing here enforces it)
    # Free-form remarks; empty string by default.
    notes: str = ""

# Structured-output request for the chat completion call: a JSON-schema
# response format with strict mode switched on.
# NOTE(review): the schema is registered under the name "ChunkAnalysis" but is
# generated from AnalysisResult (defined elsewhere in the notebook), not from
# the ChunkAnalysis model — confirm which model is intended.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "ChunkAnalysis",
        "strict": True,
        "schema": AnalysisResult.model_json_schema()
    }
}

In [351]:
async def _run_tool_call(tool_call, tool_mapping) -> str:
    """Execute one requested tool call and return the text to send back to the model.

    Failures (unknown tool, malformed arguments, exception raised by the tool)
    are returned as "Error ..." strings so the model can correct itself instead
    of the whole loop crashing.
    """
    tool_name = tool_call.function.name
    try:
        tool = tool_mapping[tool_name]
    except KeyError:
        available = ", ".join(sorted(tool_mapping))
        return f"Error: unknown tool '{tool_name}'. Available tools: {available}"

    raw_args = tool_call.function.arguments
    try:
        # An empty or missing argument string is treated as "no arguments".
        tool_args = json.loads(raw_args) if raw_args else {}
    except (TypeError, ValueError) as e:
        return f"Error: arguments for '{tool_name}' are not valid JSON: {e}"
    if not isinstance(tool_args, dict):
        return f"Error: arguments for '{tool_name}' must be a JSON object"

    try:
        tool_result = await tool(**tool_args)
    except Exception as e:
        # Covers wrong or missing parameter names as well as failures inside the tool.
        return f"Error running '{tool_name}': {e}"

    # String results are passed through unchanged; JSON-encoding them again would
    # wrap them in quotes and escape any JSON they already contain.
    if isinstance(tool_result, str):
        return tool_result
    return json.dumps(tool_result, default=str, ensure_ascii=False)


async def run_agentic_loop(system_prompt: str, prompt: str, tools, tool_mapping, response_format=None,
                           model: str = "google/gemini-2.5-flash-preview-05-20"):
    """Run a chat-completion loop, executing tool calls until the model answers without any.

    Args:
        system_prompt: Content of the initial system message.
        prompt: Content of the initial user message.
        tools: Tool definitions forwarded to the chat completions API.
        tool_mapping: Mapping from tool name to the async callable implementing it.
        response_format: Optional response format forwarded to the API.
        model: Model identifier forwarded to the API (for example
            "openai/gpt-4.1-mini"); defaults to the model this notebook used so far.

    Returns:
        The content of the first assistant message that contains no tool calls.

    Note:
        The loop has no iteration cap; it ends only when the model stops
        requesting tools.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]

    while True:
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
            response_format=response_format
        )

        message = response.choices[0].message
        # The assistant message (including its tool calls) has to be in the
        # history before the tool results that answer it.
        messages.append(message.model_dump())

        if not message.tool_calls:
            return message.content

        for tool_call in message.tool_calls:
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": tool_call.function.name,
                "content": await _run_tool_call(tool_call, tool_mapping)
            })

In [352]:
# Tag vocabulary the agent may assign, grouped by category.
AVAILABLE_TAGS = {
    # Structural units of the document
    "structural": [
        "bab",
        "bagian",
        "paragraf",
        "pasal",
        "ayat",
    ],
    # Preamble parts
    "preamble": [
        "konsideran_menimbang",
        "konsideran_mengingat",
        "memutuskan",
    ],
    # Kinds of provisions
    "content": [
        "definisi",
        "ketentuan_umum",
        "ketentuan_khusus",
        "sanksi",
        "ketentuan_peralihan",
        "ketentuan_penutup",
    ],
    # Document-level metadata fields
    "meta": [
        "judul",
        "nomor",
        "tahun",
        "tentang",
        "pembentuk",
    ],
}


# System prompt for the document analysis agent.
# It is an f-string so that AVAILABLE_TAGS is embedded as pretty-printed JSON
# (indent=2; ensure_ascii=False leaves non-ASCII characters unescaped).
# NOTE(review): the tool names listed in the prompt presumably have to match the
# names in the tool definitions / tool mapping given to the agent loop — verify.
system_prompt=f"""
You are an expert in Indonesian legal documents with access to a MongoDB database.
Your task is to analyze chunks of legal text with awareness of previously processed chunks.

Available tags you can use:
{json.dumps(AVAILABLE_TAGS, indent=2, ensure_ascii=False)}

Your workflow:
1. Decide if you need context from previous chunks
2. If needed, fetch context using get_context_for_chunk
3. Analyze the current chunk with awareness of this context
4. Store your analysis and update the document structure
5. Consider how this chunk connects to previous chunks

When analyzing with context:
- Look for sections that span across chunk boundaries
- Avoid duplicating sections already identified in previous chunks
- Connect partial sections across chunks (e.g., if a pasal starts in one chunk and continues in another)
- Use document structure information to maintain consistency

Available MongoDB tools:
- create_document_in_db: Create a new document
- get_document_from_db: Retrieve existing document
- get_context_for_chunk: Fetch context from previous chunks
- update_document_section: Update entire sections (PUT)
- add_document_section: Add items to sections (POST)
- store_chunk_analysis: Store chunk analysis results
- update_processing_status: Update processing status

Rules:
- Always consider context before making decisions
- Be conservative when deciding what's a new section vs. continuation
- Store your analysis even if you're uncertain
- Add notes about cross-chunk connections

Tips:
- You should check if the db exists and if the document is already in the db
- If the document is not in the db, create it with the initial metadata
- Use the get_context_for_chunk tool to fetch context from previous chunks
- Use the store_chunk_analysis tool to save your analysis results
- Use the update_document_section tool to update sections
- Use the add_document_section tool to add new items to sections
- Use the update_processing_status tool to update the processing status of the document
- If you need to create a new document, use the create_document_in_db tool 
- If you need to retrieve a document, use the get_document_from_db tool
"""

In [358]:
# Build the kickoff prompt from the first chunk; skipped entirely when the
# chunking cell has not produced a non-empty `chunks` list.
if 'chunks' in locals() and len(chunks) > 0:
    first_chunk = chunks[0]

    # Show which chunk is being sent, a 200-character preview, and a divider.
    print(
        f"Analyzing chunk: {first_chunk['chunk_id']}",
        f"Content preview: {first_chunk['content'][:200]}...",
        "\n" + "=" * 50,
        sep="\n",
    )

    # NOTE(review): only the first chunk's text is included here, although the
    # instruction asks the model to continue through all chunks — confirm the
    # model has a way to obtain the remaining chunks.
    prompt = (
        "Analyze this chunk of Indonesian legal document text:\n\n"
        f"Chunk ID: {first_chunk['chunk_id']}\n\n"
        f"Content:\n{first_chunk['content']}. "
        "Add/update document section based on this chunk. "
        "Move on to the next chunk if you're finished. "
        "Do it until you're finished with all the chunks."
    )

Analyzing chunk: chunk_001
Content preview: PRESIDEN
REPUBLIK INDONESIA
UNDANG-UNDANG REPUBLIK INDONESIA
NOMOR 8 TAHUN 1961
TENTANG
WAJIB KERJA SARJANA
PRESIDEN REPUBLIK INDONESIA,
Menimbang:a.bahwa ilmu dan keahlian azasnya untuk mengabdi kepa...



In [359]:
# Earlier structured-output variant, kept for reference only. Its argument list
# does not match run_agentic_loop's signature (no system_prompt), so it would
# have to be updated before being re-enabled:
# result_json = await run_agentic_loop(prompt, tools, TOOL_MAPPING, response_format)
# analysis = AnalysisResult.model_validate_json(result_json)

# Connect to MongoDB first; the agent's tools presumably rely on this connection.
await mongo_manager.connect() 

# Run the agent. No response_format is passed, so the return value is the text
# of the model's final message rather than schema-constrained JSON.
result_json = await run_agentic_loop(system_prompt, prompt, tools, TOOL_MAPPING)

✅ Connected to MongoDB: hukum_terbuka


In [357]:
# Display the agent's final reply (returned as the model's message text by the run cell).
result_json

'The analysis for `chunk_001` has been successfully stored after creating the document.\n\nHere\'s my analysis of the chunk:\n\n**Document Metadata:**\n*   **Document Type:** UNDANG-UNDANG\n*   **Number:** 8\n*   **Year:** 1961\n*   **Tentang (About):** WAJIB KERJA SARJANA\n*   **Pembentuk (Forming Authority):** PRESIDEN REPUBLIK INDONESIA\n\n**Identified Sections:**\n\n*   **Preamble - Menimbang (Considering):**\n    *   **Tag:** `konsideran_menimbang`\n    *   **Content:** Contains reasons for the law, starting with "a.bahwa ilmu dan keahlian azasnya..."\n*   **Preamble - Mengingat (Recalling):**\n    *   **Tag:** `konsideran_mengingat`\n    *   **Content:** Lists legal bases, starting with "a.Pasal 5 ayat (1) jo. pasal 20 ayat (1)..."\n*   **Preamble - MEMUTUSKAN (DECIDES):**\n    *   **Tag:** `memutuskan`\n    *   **Content:** Details decisions made, including revocation and establishment, starting with "I. Mencabut:Undang-undang Nomor 8 tahun 1951..."\n*   **Structural - Pasal 1:*