In [None]:
import os
import re
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

from lightrag import LightRAG
from lightrag.base import DocStatus
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import EmbeddingFunc

# Endpoint and credential resolution.
# Every setting is resolved with an ``or`` chain: the first variable holding a
# non-empty value wins, so a variable that is set but empty is treated the same
# as one that is missing.
LLM_BASE_URL = (
    os.getenv("OPENAI_COMPAT_BASE_URL")
    or os.getenv("LLM_BINDING_HOST")
    or os.getenv("OPENAI_API_BASE")
)
LLM_API_KEY = (
    os.getenv("OPENAI_COMPAT_API_KEY")
    or os.getenv("LLM_BINDING_API_KEY")
    or os.getenv("OPENAI_API_KEY")
)
# The default is applied with ``or`` (not as getenv's second argument) so that
# an empty LLM_MODEL does not produce an empty model name.
LLM_MODEL = (
    os.getenv("OPENAI_COMPAT_MODEL")
    or os.getenv("LLM_MODEL")
    or "gpt-4o-mini"
)

# Embedding settings fall back to the LLM endpoint/key when not given separately.
EMBED_BASE_URL = (
    os.getenv("OPENAI_COMPAT_EMBED_BASE_URL")
    or os.getenv("EMBEDDING_BINDING_HOST")
    or LLM_BASE_URL
)
EMBED_API_KEY = (
    os.getenv("OPENAI_COMPAT_EMBED_API_KEY")
    or os.getenv("EMBEDDING_BINDING_API_KEY")
    or LLM_API_KEY
)
EMBED_MODEL = (
    os.getenv("OPENAI_COMPAT_EMBED_MODEL")
    or os.getenv("EMBEDDING_MODEL")
    or "text-embedding-3-small"
)
# ``int("")`` raises ValueError, so an empty variable must fall through to the
# default before conversion.
EMBED_DIM = int(os.getenv("EMBEDDING_DIM") or "1536")
EMBED_MAX_TOKENS = int(os.getenv("MAX_EMBED_TOKENS") or "8192")

# Fail fast, before any client is built, when no credential could be resolved.
if not LLM_API_KEY:
    raise RuntimeError(
        "Provide LLM credentials via OPENAI_COMPAT_API_KEY, LLM_BINDING_API_KEY, or OPENAI_API_KEY."
    )
# Defensive guard: EMBED_API_KEY falls back to LLM_API_KEY (any of the three LLM
# variables), so this cannot trigger once the check above has passed.
if not EMBED_API_KEY:
    raise RuntimeError(
        "Provide embedding credentials via OPENAI_COMPAT_EMBED_API_KEY, EMBEDDING_BINDING_API_KEY, or OPENAI_COMPAT_API_KEY."
    )

async def compat_llm(prompt, system_prompt=None, history_messages=None, **kwargs):
    """Send a completion request through ``openai_complete_if_cache``.

    Args:
        prompt: Prompt text passed as the second positional argument.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Earlier messages; a falsy value is replaced by a
            new empty list.
        **kwargs: Extra keyword arguments forwarded untouched.

    Returns:
        Whatever ``openai_complete_if_cache`` returns for the configured
        ``LLM_MODEL``.
    """
    # Use the public OpenAI endpoint when no base URL was configured.
    endpoint = LLM_BASE_URL if LLM_BASE_URL else "https://api.openai.com/v1"
    prior_messages = history_messages if history_messages else []

    completion = await openai_complete_if_cache(
        LLM_MODEL,
        prompt,
        system_prompt=system_prompt,
        history_messages=prior_messages,
        base_url=endpoint,
        api_key=LLM_API_KEY,
        **kwargs,
    )
    return completion

async def compat_embed_func(texts: list[str]):
    """Embed ``texts`` by delegating to ``openai_embed``.

    Args:
        texts: Strings to embed, passed through as given.

    Returns:
        Whatever ``openai_embed`` returns for the configured ``EMBED_MODEL``.
    """
    # Use the public OpenAI endpoint when no embedding base URL was resolved.
    endpoint = EMBED_BASE_URL if EMBED_BASE_URL else "https://api.openai.com/v1"

    vectors = await openai_embed(
        texts,
        model=EMBED_MODEL,
        base_url=endpoint,
        api_key=EMBED_API_KEY,
    )
    return vectors

# Bundle the embedding coroutine with the dimension and max token size taken
# from the environment-derived constants.
compat_embed = EmbeddingFunc(
    func=compat_embed_func,
    embedding_dim=EMBED_DIM,
    max_token_size=EMBED_MAX_TOKENS,
)

# LightRAG instance using the two wrappers above, with "demo2_storage" as its
# working directory.
rag = LightRAG(
    llm_model_func=compat_llm,
    embedding_func=compat_embed,
    working_dir="demo2_storage",
)

# Folder scanned (non-recursively) for the .txt files to ingest.
INPUT_DIR = Path("input_docs")
if not INPUT_DIR.exists():
    raise FileNotFoundError("input_docs directory not found. Add your .txt files before running.")

# Expected file name shape: <law>_<4-digit year>__<article>.txt
FILENAME_PATTERN = re.compile(r"(?P<law>\d+)_(?P<year>\d{4})__(?P<article>\d+)\.txt")

# Three index-aligned lists: document payloads, their ids, and their source paths.
docs: list[dict[str, object]] = []
doc_ids: list[str] = []
file_paths_arg: list[str] = []

# Sorted so that documents are collected in a stable, name-based order.
for txt_path in sorted(INPUT_DIR.glob("*.txt")):
    match = FILENAME_PATTERN.fullmatch(txt_path.name)
    if match is None:
        # Any name that does not fit the pattern aborts the whole run.
        raise ValueError(f"Unexpected file name format: {txt_path.name}")

    # int() drops leading zeros from the numbers; the year stays a string.
    law_number = int(match["law"])
    year = match["year"]
    article_number = int(match["article"])

    ligji_name = f"Ligji {law_number}/{year}"
    neni_name = f"Neni {article_number} | {ligji_name}"

    # Hand-built graph fragment: one law node, one article node, and a single
    # "part_of" edge pointing from the article to the law.
    law_entity = {"entity_name": ligji_name, "entity_type": "ligj"}
    article_entity = {"entity_name": neni_name, "entity_type": "neni"}
    part_of_edge = {
        "src_id": neni_name,
        "tgt_id": ligji_name,
        "description": "part_of",
        "relation_type": "part_of",
    }
    manual_graph = {
        "entities": [law_entity, article_entity],
        "relationships": [part_of_edge],
    }

    doc_metadata = {
        "ligji_name": ligji_name,
        "neni_name": neni_name,
        "source_file": txt_path.name,
    }
    doc_payload = {
        "content": txt_path.read_text(encoding="utf-8"),
        "manual_graph": manual_graph,
        "metadata": doc_metadata,
    }

    docs.append(doc_payload)
    doc_ids.append(f"doc-{txt_path.stem}")
    file_paths_arg.append(str(txt_path))

if not docs:
    raise RuntimeError("No .txt files found in input_docs directory.")

await rag.initialize_storages()
await initialize_pipeline_status()

track_id = await rag.ainsert(docs, ids=doc_ids, file_paths=file_paths_arg)
print("Track:", track_id)

processed = await rag.doc_status.get_docs_by_status(DocStatus.PROCESSED)
for doc_id, status in processed.items():
    print(doc_id, status.chunks_list)

await rag.finalize_storages()