In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
from package.databases.initialize import initialize_memories
initialize_memories()

In [29]:
from package.utils.data_loder import PDFLoader
from package.interface import SourceOptions
from package.flows.offline import OfflineFlow
from package.databases.management.longterm import LongTermManagement
from package.databases.management.user import UserManagement
from package.databases.management.document import DocumentManagement
from package.databases.management.project import ProjectManagement
# from package.databases.management.jargon import JargonManagement
from package.databases.session import Session, get_session, Depends
from package.databases.models.user import User
from package.databases.models.document import Document
from package.databases.models.project import Project
# from package.databases.models.jargon import Jargon
from package.databases.models.longterm import LongTerm
from package.embedding.baai import BAAIEmbedding
from package.databases.management.term import TermManagement

embedder = BAAIEmbedding()
ltm = LongTermManagement()
dm = DocumentManagement()
# jm = JargonManagement()
um = UserManagement()
pm = ProjectManagement()
tm = TermManagement()

🔍 Loading model from: BAAI/bge-m3


In [30]:
user = User(
    username="bank",
    password="555",
    email="bank@bank.com"
)

um = UserManagement()
user = um.create_user(user, session=Depends(get_session))

In [31]:
user.id

'9f9d04af-4ff7-41b4-bd42-cf5aada7361e'

In [32]:
# document1 = Document(source="./sources/storm.pdf", type="pdf")

# dm = DocumentManagement()
# document1 = dm.create_document(document1, session=Depends(get_session)) 

In [33]:
# document1.id

In [34]:
# dm.read_document_longterms(document_id=document1.id, session=Depends(get_session))

In [35]:
# from glob import glob
# sources = glob("./sources/*")
# sources

In [36]:
import boto3
import json
import uuid
import os
import time
from dotenv import load_dotenv
load_dotenv()
STATE_MACHINE_ARN = os.getenv("STATE_MACHINE_ARN")

def run_stepfunctions(stateMachineArn, document_chunks):

    # Replace this with your actual Step Function ARN

    # Create a unique name for this execution
    execution_name = f"test-execution-{uuid.uuid4()}"

    # Input payload
    input_payload = {
        "document_chunks": document_chunks
    }

    # Create Step Functions client
    sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")

    # Start execution
    response = sfn.start_execution(
        stateMachineArn=stateMachineArn,
        name=execution_name,
        input=json.dumps(input_payload)
    )
    return response

def check_stepfunctions_job(executionArn):
    sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")
    while True:
        desc = sfn.describe_execution(executionArn=executionArn)
        status = desc["status"]
        print(f"Execution status: {status}")
        if status in ("SUCCEEDED", "FAILED", "TIMED_OUT", "ABORTED"):
            break
        time.sleep(2)

    if status == "SUCCEEDED":
        output = desc.get("output")
        print("Execution output:", output)
    else:
        print(f"Execution ended with status: {status}")    

In [37]:
from package.databases.utils import now_utc
def embed(longterms, embedder, session: Session = Depends(get_session)):
    updated_at = now_utc()
    raws = [longterm.raw for longterm in longterms]
    raw_vectors = embedder.run(sentences=raws)
    enrichs = [longterm.enrich for longterm in longterms]
    enrich_vectors = embedder.run(sentences=enrichs)
    combos = [longterm.combo for longterm in longterms]
    combo_vectors = embedder.run(sentences=combos)
    for longterm, vector in zip(longterms, raw_vectors):
        longterm.raw_embedding = vector
    for longterm, vector in zip(longterms, enrich_vectors):
        longterm.enrich_embedding = vector
    for longterm, vector in zip(longterms, combo_vectors):
        longterm.combo_embedding = vector
        longterm.updated_at = updated_at
    ltm.update_longterms(longterms=longterms, session=session)

In [None]:
from pathlib import Path
from tqdm import tqdm

source_dir = Path("./sources")
pdf_files = source_dir.glob("*.pdf")
olf = OfflineFlow()
errors = []

# target = 'Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models.pdf'
# target = 'AcuRank Uncertainty-Aware Adaptive Computation for Listwise Reranking.pdf'

for pdf_path in tqdm(pdf_files):
    source = pdf_path.name
    # if source == target:
    #     continue  # skip until we reach the target file

    source_type = pdf_path.suffix.lstrip(".")
    source_ops = SourceOptions(
        path=str(pdf_path),
        type=source_type
    )
    document = Document(source=source, type=source_type)
    document = dm.create_document(document, session=Depends(get_session))
    try:
        print(source)
        loader = PDFLoader(source=source_ops)
        contexts = loader.run()


        longterms = olf.run(document_id=document.id, contexts=contexts)
        ltm.create_raws(longterms, session=Depends(get_session))

        longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
        document_chunks = [{"document_id": document.id, "longterm_id": longterm.id} for longterm in longterms]

        response = run_stepfunctions(stateMachineArn=STATE_MACHINE_ARN, document_chunks=document_chunks)
        executionArn = response["executionArn"]
        check_stepfunctions_job(executionArn)

        longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
        embed(longterms, embedder=embedder, session=Depends(get_session))

    except Exception as e:
        errors.append((document.id, source, str(e)))

    # break  # ✅ stop the loop after processing the matched file


0it [00:00, ?it/s]

Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"dce6dadb-ee48-496d-9504-c650384088ad","document_id":"a57cc0be-4da7-4d4f-ba98-5619fe05dd5c"}},{"Payload":{"processed":"1dd848ad-3e31-4a48-bbd9-bc180cd9e2b8","document_id":"a57cc0be-4da7-4d4f-ba98-5619fe05dd5c"}},{"Payload":{"processed":"ae0ee5ef-b27d-4341-9aa4-b2a97e4e388f","document_id":"a57cc0be-4da7-4d4f-ba98-5619fe05dd5c"}},{"Payload":{"processed":"ffa21f8a-4fa3-4893-a6d4-51a2a84fa638","document_id":"a57cc0be-4da7-4d4f-ba98-5619fe05dd5c"}},{"Payload":{"processed":"d0b9d61c-64a0-494c-9df9-4137304d6738","document_id":"a57cc0be-4da7-4d4f-ba98-5619fe05dd5c"}},{"Payload":{"processed":"58545a6e-e598-4450-b0aa-cfff6de6ec02","document_

2it [00:42, 21.34s/it]

ClueAnchor Clue-Anchored Knowledge Reasoning Exploration and Optimization for Retrieval-Augmented Generation.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"8a9a5151-f615-4208-8d9e-f054d070499a","document_id":"0ab7ef49-011d-4409-ac69-254d6b88eba1"}},{"Payload":{"processed":"eb99ff44-11fc-40e8-a350-2789e87d0702","document_id":"0ab7ef49-011d-4409-ac69-254d6b88eba1"}},{"Payload":{"processed":"3fed9c8b-2d31-448a-baac-3b18078432f7","document_id":"0ab7ef49-011d-4409-ac69-254d6b88eba1"}},{"Payload":{"processed":"dd15cac4-604e-47bc-ae55-1d468a4ea33c","document_id":"0ab7ef49-011d-4409-ac69-254d6b88eba1"}},{"Payload

3it [01:56, 43.02s/it]

DiscoVLA Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"0641fa23-9e7d-418b-b51f-1d06f2f7f58b","document_id":"16514705-ab87-4115-883c-8ebf8ed58f9d"}},{"Payload":{"processed":"cf52305d-178d-45f1-85d4-134fb8ef537f","document_id":"16514705-ab87-4115-883c-8ebf8ed58f9d"}},{"Payload":{"processed":"d0a93a16-d0ce-47a1-8e2f-a154eda19040","document_id":"16514705-ab87-4115-883c-8ebf8ed58f9d"}},{"Payload":{"processed":"4e20e90c-8dd9-4afb-a2c1-a9f942ffc470","document_id":"16514705-ab87-4115-883c-8ebf8ed58f9d"}},{"Payload":{"processed":"a7370980-f81d-4910-8b80-480405ef5bb2","document_id":"16514705-ab87-4115-883c-8ebf8ed58f9d"}},{"Payload":{"processed":"9679a211-9458-422b-93db-af8828630bf8","document_id":"16514705-ab87-4115-883c-8ebf8ed58f9d"}},{"Paylo

4it [02:37, 42.53s/it]

EXP4FUSE A RANK FUSION FRAMEWORK FOR ENHANCED SPARSE RETRIEVAL USING LARGE LANGUAGE MODEL-BASED QUERY EXPANSION.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"8c2c2c47-6b32-4e87-8fd6-6a8a80014bff","document_id":"c73c3a63-4d4a-438e-877c-54a7e6067d2d"}},{"Payload":{"processed":"1cec6952-c79d-46a3-b2f2-59ba2f5d6bb9","document_id":"c73c3a63-4d4a-438e-877c-54a7e6067d2d"}},{"Payload":{"processed":"c0978d4b-8ce4-4786-aa7b-11abe458e16d","document_id":"c73c3a63-4d4a-438e-877c-54a7e6067d2d"}},{"Payload":{"processed":"34b47d4c-b1a0-42b9-aae1-2b04f2e28351","document_id":"c73c3a63-4d4a-438e-877c-54a7e6067d2d"}},{"Payload":{"processed":"024947ed-f9ff-496a-a352-a7f5b7f18405","document_id":"c73c3a63-4d4a-438e-877c-54a7e6067d2d"}},{"Payload":{"processed":"5d392769-9403-492c-be2

5it [03:17, 41.66s/it]

GainRAG Preference Alignment in Retrieval-Augmented Generation through Gain Signal Synthesis.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"5bff8a90-0d88-4ea7-afc4-2d17bc349f48","document_id":"21a9b572-b1be-43e7-a143-b391ab52e966"}},{"Payload":{"processed":"e6afef03-75cd-4cc7-ba27-ea4381174816","document_id":"21a9b572-b1be-43e7-a143-b391ab52e966"}},{"Payload":{"processed":"b20dc8f7-9b65-4053-8415-c09a922d4454","document_id":"21a9b572-b1be-43e7-a143-b391ab52e966"}},{"Payload":{"processed":"07c53a93-2c06-46b3-9876-c1450cdb4866","document_id":"21a9b572-b1be-43e7-a143-b391ab52e966"}},{"Payload":{"processed":"06c2def9-914d-458f-8b29-115d4d1add50","document_id":"21a9b572-b1be-43e7-a143-b391ab52e966"}},{"Payload":{"processed":"383f25aa-41dd-41f9-abfc-50a54ea0341a","document_id":"21a9b572-b1be-43e7-a143-b391ab52e966"}},{"Payload":{"processed":"78fdf576

6it [03:45, 36.99s/it]

GenKI Enhancing Open-Domain Question Answering with Knowledge Integration and Controllable Generation in Large Language Models.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"f3447d9f-3909-414c-ac26-8a86296fe3eb","document_id":"93c2444d-f2c3-4609-8bd9-badaa0930ac2"}},{"Payload":{"processed":"c9e58683-8f2c-4298-bf1f-e79f135e5e10","document_id":"93c2444d-f2c3-4609-8bd9-badaa0930ac2"}},{"Payload":{"processed":"afaee9ce-fb09-4fa7-8ccd-956d3051c3ce","document_id":"93c2444d-f2c3-4609-8bd9-badaa0930ac2"}},{"Payload":{"processed":"409a12b0-2fab-45f3-ab62-75e5913c4795","document_id":"93c2444d-f2c3-4609-8bd9-badaa0930ac2"}},{"Payload":{"processed":"4eeff545-efcc-4e26-8dd1-686265144920","document_id":"93c2444d-f2c3-4609-8bd9-badaa0930ac2"}},{"Payload":{"processed":"d0414251-25ee-4c6b-b4a0-59ec7787ae0e","document_

7it [04:25, 38.12s/it]

HippoRAG Neurobiologically Inspired Long-Term Memory for Large Language Models.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"d91bb241-2766-4e8c-8ad1-1ae8b3dca7be","document_id":"d62d0f7b-ea37-42f7-8402-cf68a4541717"}},{"Payload":{"processed":"d424dcdd-a14f-407d-b1a0-ba69071645f8","document_id":"d62d0f7b-ea37-42f7-8402-cf68a4541717"}},{"Payload":{"processed":"b44a491b-23a2-4424-86c8-9195583ec95f","document_id":"d62d0f7b-ea37-42f7-8402-cf68a4541717"}},{"Payload":{"processed":"b232c452-814e-4dcf-9293-5734e9426822","document_id":"d62d0f7b-ea37-42f7-8402-cf68a4541717"}},{"Payload":{"processed":"cde5d5b8-11f6-4b31-87d7-f52c9ae06367

8it [05:32, 47.09s/it]

LlamaRec-LKG-RAG A Single-Pass, Learnable Knowledge Graph-RAG Framework for LLM-Based Ranking.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(3)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"23ded16a-468f-4765-b332-c4bd39737634","document_id":"bee0068f-e22d-42be-9139-064d76f655b9"}},{"Payload":{"processed":"7de3a046-47b5-4e07-9a73-7397105a884f","document_id":"bee0068f-e22d-42be-9139-064d76f655b9"}},{"Payload":{"processed":"c4471dc9-8052-418d-9f16-6965fc00cccc","document_id":"bee0068f-e22d-42be-9139-064d76f655b9"}},{"Payload":{"processed":"e5e836c2-11f5-4ef2-b8c8-d4e943223527","document_id":"bee0068f-e22d-42be-9139-064d76f655b9"}},{"Payload":{"processed":"e6ace379-50bd-44e5-9e94-25bbe10959d1","document_id":"bee0068f-e22d-42be-9139-064d76f655b9"}},{"Payload":{"processed":"8039e848-2e6d-4740-811f-07dd44272279","document_id":"bee0068f-e22d-42be-9139-064d76f655b9"}},{"Payload":{"processed":"9b22acb1

9it [06:06, 43.10s/it]

LOGICOL Logically-Informed Contrastive Learning for Set-based Dense Retrieval.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"a0fbef56-1e83-4276-b5cf-5285f6aa67f8","document_id":"d7d8642f-4151-42b1-b901-f821056f6e2d"}},{"Payload":{"processed":"9adf394d-cb57-4f85-817e-c94fe59a367a","document_id":"d7d8642f-4151-42b1-b901-f821056f6e2d"}},{"Payload":{"processed":"ab1656ab-b71e-4f5e-9d20-81fd8ced9737","document_id":"d7d8642f-4151-42b1-b901-f821056f6e2d"}},{"Payload":{"processed":"b3dea886-01e8-4394-a148-c58a0ec4c34e","document_id":"d7d8642f-4151-42b1-b901-f821056f6e2d"}},{"Payload":{"processed":"e69201d6-869a-4333-b676-0063c0b9f996","document_id":"d7d8642f-4151-42b1-b901-f821056f6e2d"}},{"Payload":{"processed":"b76b2d7b-0de0-41cc-9a2f-679d0f8bf56a","document_id":"d7d8642f-4151-42b1-b901-f821056f6e2d"}},{"Payload":{"processed":"2aa53b65

10it [06:31, 37.62s/it]

MASKSEARCH A Universal Pre-Training Framework to Enhance Agentic Search Capability.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"1fdc3d97-2e27-4580-9efb-61aa6f684689","document_id":"24a2dcc3-c8ec-445e-8336-816971796e30"}},{"Payload":{"processed":"c5922295-67a0-4683-ac6d-126a4231ff0e","document_id":"24a2dcc3-c8ec-445e-8336-816971796e30"}},{"Payload":{"processed":"c732fddb-803b-49b0-94cd-d19fd2e63b3b","document_id":"24a2dcc3-c8ec-445e-8336-816971796e30"}},{"Payload":{"processed":"62abdcca-fb9e-4d97-bcfc-117bddbdb723","document_id":"24a2dcc3-c8ec-445e-8336-816971796e30"}},{"Payload":{"processed":"69213218-901a-4b91-b2bd-4ad703b4e3cf","document_id":"24a2dcc3-c8ec-445e-8336-816971796e30"}},{"Payload":{"processed":"f3c5f917-afd6-41b6-9c60-15b15271d129","document_

11it [07:21, 41.37s/it]

PAKTON A Multi-Agent Framework for Question Answering in Long Legal Agreements.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"fc64ce28-83c4-4f39-8262-ed7bc5d34450","document_id":"cbd3c93d-09d3-4cc2-bc95-d36c2e4aefa9"}},{"Payload":{"processed":"c2970118-4b50-43ee-b82c-3b4294f4f487","document_id":"cbd3c93d-09d3-4cc2-bc95-d36c2e4aefa9"}},{"Payload":{"processed":"20e127c9-15d0-4889-9cf3-950e8eded22e","document_id":"cbd3c93d-09d3-4cc2-bc95-d36c2e4aefa9"}},{"Payload":{"processed":"74f1109b-36ba-4f3b-9fda-de6d86dddf1a","document_id":"cbd3c93d-09d3-4cc2-bc95-d36c2e4aefa9"}},{"Payload":{"processed":"f134a74b-c

12it [08:43, 53.48s/it]

POQD Performance-Oriented Query Decomposer for Multi-vector retrieval.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"873513f1-4914-4379-aa99-52e76f0a1eb4","document_id":"9099006f-03c4-4b0d-a7f3-0538fea835c6"}},{"Payload":{"processed":"5388e8d2-a5a9-4738-a38e-a3cb2c7ccd57","document_id":"9099006f-03c4-4b0d-a7f3-0538fea835c6"}},{"Payload":{"processed":"ca7b857a-fa02-4d97-b0a3-f01c09d65dda","document_id":"9099006f-03c4-4b0d-a7f3-0538fea835c6"}},{"Payload":{"processed":"67e240f9-3c3f-44bb-93fd-40aff2ae7b32","document_id":"9099006f-03c4-4b0d-a7f3-0538fea835c6"}},{"Payload":{"processed":"ad88cacb-0a66-4234-9e32-b42f8f2c67c0","document_id":"9099006f-03c4-4b0d-a7f3-0538fea835c6"}},{"Payload":{"processed":"162dc340-be97-4999-8e59-fcbed72f813e","document_id":"9099006f-03c4-4b0d-a7

13it [09:22, 49.05s/it]

RARE Retrieval-Aware Robustness Evaluation for Retrieval-Augmented Generation Systems.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(3)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"a954e0c4-1e97-4f42-80e4-f2b31d609b72","document_id":"050e6a5c-8b0f-4649-bd05-547cf962f701"}},{"Payload":{"processed":"a7f61736-42ad-4cfd-b8a6-61a40c49741d","document_id":"050e6a5c-8b0f-4649-bd05-547cf962f701"}},{"Payload":{"processed":"313bd81d-a542-4a06-8a2a-a95b028e3e4c","document_id":"050e6a5c-8b0f-4649-bd05-547cf962f701"}},{"Payload":{"processed":"eb7edea2-ceb6-4a23-80af-15c28a6e17e4","document_id":"050e6a5c-8b0f-4649-bd05-547cf962f701"}},{"Payload":{"processed":"26bc8ea0-fdcd-4d61-a60f-78ca58b451f5","document_id":"050e6a5c-8b0f-4649-bd05-547cf962f701"}},{"Payload":{"processed":"8f6c9066-2ac4-4216-bac6-4c2624ec83e7","document_id":"050e6a5c-8b0f-4649-bd05-547cf962f701"}},{"Paylo

14it [09:58, 45.19s/it]

REARANK Reasoning Re-ranking Agent via Reinforcement Learning.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"985e8aa3-d2a9-4fc8-b2dd-a1b5cc522dee","document_id":"77b299e1-3a35-45f1-9bee-b943d5764c10"}},{"Payload":{"processed":"89f1893c-3f02-4ead-b701-6f299c207cfd","document_id":"77b299e1-3a35-45f1-9bee-b943d5764c10"}},{"Payload":{"processed":"87857a7e-6fed-4a43-950b-d95e1929d7f1","document_id":"77b299e1-3a35-45f1-9bee-b943d5764c10"}},{"Payload":{"processed":"b90774f3-2d1d-4449-be0c-2b67b8556121","document_id":"77b299e1-3a35-45f1-9bee-b943d5764c10"}},{"Payload":{"processed":"84fb1955-a50f-42b7-a69a-6921b4380f32","document_id":"77b299e1-3a35-45f1-9bee-b943d5764c10"}},{"Payload":{"processed":"a51d03ea-056e-4b79-8920-4718322807ce","document_id":"77b299e1-3a35-45f1-9bee-b943d5764c10"}},{"Paylo

15it [10:43, 45.35s/it]

SlideCoder Layout-aware RAG-enhanced Hierarchical Slide Generation from Design.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"19976b15-dfff-4b0c-bdf0-94dc9c8476df","document_id":"52a735db-6cf1-402c-be43-026754b90479"}},{"Payload":{"processed":"5ef64ed4-3f4f-4626-b283-e06a84e9bc61","document_id":"52a735db-6cf1-402c-be43-026754b90479"}},{"Payload":{"processed":"b976a026-439b-49e8-a71e-04ea40f558dc","document_id":"52a735db-6cf1-402c-be43-026754b90479"}},{"Payload":{"processed":"e3b87d86-cea9-4912-b560-8179104d88e1","document_id":"52a735db-6cf1-402c-be43-026754b90479"}},{"Payload":{"processed":"1dff201a-5df2-4acc-85b4-4e5f92e35b45","document_id":"52a735db-6cf1-402c-be43-026754b90479"}},{"Payload":{"processed":"f1dc1612-7b4a-413c-bd17-baae86179ac5","document_

16it [11:21, 43.06s/it]

SORCE Small Object Retrieval in Complex Environments.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"d3dea7ab-4900-423c-9d5d-51b986e2da83","document_id":"25d85652-170a-4bbe-8f9e-76c03bc4b75c"}},{"Payload":{"processed":"b4fff77e-f124-465d-9f5b-07dc2278f556","document_id":"25d85652-170a-4bbe-8f9e-76c03bc4b75c"}},{"Payload":{"processed":"62dceb01-b7b5-4c2a-9a52-e825ce0c3337","document_id":"25d85652-170a-4bbe-8f9e-76c03bc4b75c"}},{"Payload":{"processed":"a05ac0b3-8648-4e2f-8019-fb3511868c76","document_id":"25d85652-170a-4bbe-8f9e-76c03bc4b75c"}},{"Payload":{"processed":"8c80f7d3-fb0e-4519-9872-0bafe30a37fe","document_id":"25d85652-170a-4bbe-8f9e-76c03bc4b75c"}},{"Payload":{"processed":"2e1d2713-dfa8-43eb-9aae-96fd10a83e00","document_id":"25d85652-170a-4bbe-8f9e-76c03bc4b75c"}},{"Paylo

17it [11:56, 40.66s/it]

TracLLM A Generic Framework for Attributing Long Context LLMs.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"38aba611-642c-4c05-990a-9f68bdeacb63","document_id":"a16527e3-50b2-41a2-8c59-9ee0968ce39d"}},{"Payload":{"processed":"0d782daf-faff-48a1-b8ac-3dd11dcd0765","document_id":"a16527e3-50b2-41a2-8c59-9ee0968ce39d"}},{"Payload":{"processed":"e31fc795-a859-424f-9856-fe62cff6d6c3","document_id":"a16527e3-50b2-41a2-8c59-9ee0968ce39d"}},{"Payload":{"processed":"9ea35eb3-5138-40b3-a93c-c19932602c39","document_id":"a16527e3-50b2-41a2-8c59-9ee0968ce39d"}},{"Payload":{"processed":"cc040b97-9b20-4454-ad79-8a5f7e98a8a5","document_id":"a16527e3-50b2-41a2-8c59-9ee0968ce39d"}},{"Payload":{"processed":"a83027d3-5b82-40bf-8704-27e07173c637","document_

18it [12:41, 42.29s/it]


In [42]:
errors

[]

In [52]:
from sqlmodel import Session, select
from package.databases.engine import engine
from package.databases.models.longterm import LongTerm
from package.databases.models.term import Term
from package.databases.models.document import Document

In [53]:
with Session(engine) as session:
    statement = select(Document)
    results = session.exec(statement).all()

len(results)

18

In [54]:
with Session(engine) as session:
    statement = select(LongTerm)
    results = session.exec(statement).all()

len(results)

1903

In [55]:
with Session(engine) as session:
    statement = select(Term)
    results = session.exec(statement).all()

len(results)

6734

In [17]:
longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
len(longterms)

325

In [19]:
len([l.enrich for l in longterms if l.enrich is None])

296

In [20]:
len([l.enrich for l in longterms if l.enrich])

29

In [23]:
_enrich_none = [l for l in longterms if l.enrich is None]

In [24]:
from package.agents.context_enricher import ContextEnricher
ce = ContextEnricher()
ce.run(_enrich_none[0].raw)

Summary(summary='AcuRank is an uncertainty-aware adaptive computation method for listwise reranking.')

In [16]:
_terms = tm.read_terms(session=Depends(get_session))
[t.term for t in _terms]

['TourRank-1',
 'TourRank-10',
 'TourRank-5',
 'TourRank-10',
 'AcuRank-9',
 'TourRank-2',
 'TourRank-2',
 'TourRank-5',
 'AcuRank-9',
 'AcuRank',
 'BM25',
 'BM25',
 'SW-2',
 'TREC-DL',
 'BEIR',
 'TrueSkill',
 'AcuRank',
 'µ i',
 'topk']

In [22]:
from package.agents.context_enricher import ContextEnricher

ce = ContextEnricher()

error_enrich = []
enrichs = []
for longterm in tqdm(longterms):
    context = longterm.raw
    try:
        enrich = ce.run(context=context)
        enrichs.append(enrich)
    except Exception as e:
        error_enrich.append((longterm.id, str(e)))


100%|██████████| 87/87 [01:35<00:00,  1.10s/it]


In [23]:
len(error_enrich)

0

In [24]:
from package.agents.term_extractor import TermExtractor

te = TermExtractor()

error_term = []
terms = []
for longterm in tqdm(longterms):
    context = longterm.raw
    try:
        term = te.run(context=context)
        terms.extend(term)
    except Exception as e:
        error_term.append((longterm.id, str(e)))

 30%|██▉       | 26/87 [00:58<02:28,  2.44s/it]

[91mBoth parse_structured_output and content_extractor failed:
3 validation errors for ExtractedTerms
terms.1.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.2.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.3.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
BroAgent.parse_structured_output() got multiple values for argument 'text'[0m


 33%|███▎      | 29/87 [01:05<02:26,  2.52s/it]

[91mBoth parse_structured_output and content_extractor failed:
3 validation errors for ExtractedTerms
terms.0.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Variable', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.1.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.2.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
BroAgent.parse_structured_output() got multiple values for argument 'text'[0m


 45%|████▍     | 39/87 [01:24<02:03,  2.58s/it]

[91mBoth parse_structured_output and content_extractor failed:
1 validation error for ExtractedTerms
  Invalid JSON: invalid escape at line 25 column 72 [type=json_invalid, input_value='\n{\n    "terms": [\n   ...\n        }\n    ]\n}\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
BroAgent.parse_structured_output() got multiple values for argument 'text'[0m


 71%|███████▏  | 62/87 [02:16<01:15,  3.03s/it]

[91mBoth parse_structured_output and content_extractor failed:
5 validation errors for ExtractedTerms
terms.0.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function/Method', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.1.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function/Method', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.2.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function/Method', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.3.type
  Input should be 'Acronym', 'Abbreviation', 'F

100%|██████████| 87/87 [03:05<00:00,  2.13s/it]


In [25]:
len(error_term)

4

In [26]:
error_term

[('20614436-94eb-4362-8e06-6e4e2b31d673', "'NoneType' object is not iterable"),
 ('f241d31a-5515-4aa6-b31f-514aa803d54b', "'NoneType' object is not iterable"),
 ('2abc00f8-f60f-44ce-aed0-808e2cd852f9', "'NoneType' object is not iterable"),
 ('417612bc-71fb-4f35-8267-6105ab7168aa', "'NoneType' object is not iterable")]

In [29]:
debug = ltm.read_longterm(longterm_id='20614436-94eb-4362-8e06-6e4e2b31d673', session=Depends(get_session))

te.run(debug.raw)

[91mBoth parse_structured_output and content_extractor failed:
3 validation errors for ExtractedTerms
terms.1.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.2.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
terms.3.type
  Input should be 'Acronym', 'Abbreviation', 'Framework', 'Algorithm', 'Technical Term', 'Jargon' or 'Proper Name' [type=literal_error, input_value='Function', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/literal_error
BroAgent.parse_structured_output() got multiple values for argument 'text'[0m


In [31]:
ce.run(debug.raw)

Summary(summary='STORM algorithm generates an outline and references for a given topic, considering related topics and their Wikipedia articles.')

In [32]:
document.id

'2aa78fac-48b2-4a4d-a70d-0fa04f8b9071'

In [33]:
debug.id

'20614436-94eb-4362-8e06-6e4e2b31d673'

In [30]:
_lt = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
_lt[1].document_id, _lt[1].id

('ea49fae9-14b6-4f4a-a876-e0fe9685118b',
 '4dbe2cbe-55d4-437d-b080-19ef3b4a7a86')

In [37]:
ltm.read_longterm(longterm_id='20614436-94eb-4362-8e06-6e4e2b31d673', session=Depends(get_session)).enrich

'STORM algorithm generates an outline and references for a given topic, considering related topics and their Wikipedia articles.'

In [35]:
tm.read_terms(session=Depends(get_session))

[]

In [13]:
from pathlib import Path
from tqdm import tqdm
source_dir = Path("./sources")
pdf_files = source_dir.glob("*.pdf")  # yields Path objects
olf = OfflineFlow()
errors = []
for pdf_path in tqdm(pdf_files):
    source = pdf_path.name
    if source != 'Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models.pdf':
        continue
    source_type = pdf_path.suffix.lstrip(".")
    source_ops = SourceOptions(
        path=str(pdf_path),             # e.g., "./sources/Filename.pdf"
        type=source_type  # e.g., "pdf"
    )
    try:
        loader = PDFLoader(source=source_ops)
        contexts = loader.run()
        document = Document(source=source, type=source_type)
        document = dm.create_document(document, session=Depends(get_session))     
        longterms = olf.run(document_id=document.id, contexts=contexts)
        ltm.create_raws(longterms, session=Depends(get_session))
        longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
        document_chunks = [{"document_id": document.id, "longterm_id": longterm.id} for longterm in longterms]
        response = run_stepfunctions(stateMachineArn=STATE_MACHINE_ARN, document_chunks=document_chunks)
        executionArn = response["executionArn"]
        check_stepfunctions_job(executionArn)
        longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
        embed(longterms, embedder=embedder, session=Depends(get_session))
    except Exception as e:
        errors.append((source, str(e)))



1it [00:17, 17.83s/it]


KeyboardInterrupt: 

In [12]:
source_ops = SourceOptions(path="./sources/storm.pdf", type="pdf")
loader = PDFLoader(
    source=source_ops
)
contexts = loader.run()

Markdown headings: max(2)


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


In [13]:
olf = OfflineFlow()

In [14]:
longterms = olf.run(document_id=document1.id, contexts=contexts)

In [15]:
ltm = LongTermManagement()
ltm.create_raws(longterms, session=Depends(get_session))

In [16]:
# from package.embedding.baai import BAAIEmbedding
# embedding = BAAIEmbedding()

In [17]:
document1.id

'068ec5c0-f1df-4733-930a-ac8c6e0390aa'

In [18]:
longterms = ltm.read_longterms_by_document(document_id=document1.id, session=Depends(get_session))

In [19]:
# sentences = [longterm.raw for longterm in longterms]
# vectors = embedding.run(sentences=sentences)
# for longterm, vector in zip(longterms, vectors):
#     longterm.raw_embedding = vector
# ltm.update_longterms(longterms=longterms, session=Depends(get_session))

In [20]:
# # query = """What does ROUGE Score do?"""
# query = """How was FreshWiki created?"""
# vector = embedding.run(sentences=[query])[0]
# results = ltm.read_similar_text(vector, limit=15, embed_method="raw", sources=[document1.source], session=Depends(get_session))

In [21]:
# for result in results:
#     print(result.meta)

In [31]:
document_chunks = [{"document_id": document1.id, "longterm_id": longterm.id} for longterm in longterms]

In [32]:
len(document_chunks)

87

In [33]:
document_chunks[0]

{'document_id': '068ec5c0-f1df-4733-930a-ac8c6e0390aa',
 'longterm_id': '8060ff6c-a4e8-4ac4-8b80-a8986bb998ea'}

In [None]:
import boto3
import json
import uuid
import os
from dotenv import load_dotenv
load_dotenv()

# Replace this with your actual Step Function ARN
STATE_MACHINE_ARN = os.getenv("STATE_MACHINE_ARN")

# Create a unique name for this execution
execution_name = f"test-execution-{uuid.uuid4()}"

# Input payload
input_payload = {
    "document_chunks": document_chunks[:]
}

# Create Step Functions client
sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")

# Start execution
response = sfn.start_execution(
    stateMachineArn=STATE_MACHINE_ARN,
    name=execution_name,
    input=json.dumps(input_payload)
)

print("Execution started!")
print("Execution ARN:", response["executionArn"])

Execution started!
Execution ARN: arn:aws:states:ap-southeast-1:112557628841:execution:broai-arai-enrich-fleet:test-execution-bd73a5ad-1420-4b6b-84da-1cc9ef7d3791


In [None]:
import boto3
import json
import uuid
import os
import time
from dotenv import load_dotenv
load_dotenv()
STATE_MACHINE_ARN = os.getenv("STATE_MACHINE_ARN")

def run_stepfunctions(stateMachineArn, document_chunks):

    # Replace this with your actual Step Function ARN

    # Create a unique name for this execution
    execution_name = f"test-execution-{uuid.uuid4()}"

    # Input payload
    input_payload = {
        "document_chunks": document_chunks
    }

    # Create Step Functions client
    sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")

    # Start execution
    response = sfn.start_execution(
        stateMachineArn=stateMachineArn,
        name=execution_name,
        input=json.dumps(input_payload)
    )
    return response["executionArn"]

def check_job(execution_arn):
    sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")
    while True:
        desc = sfn.describe_execution(executionArn=execution_arn)
        status = desc["status"]
        print(f"Execution status: {status}")
        if status in ("SUCCEEDED", "FAILED", "TIMED_OUT", "ABORTED"):
            break
        time.sleep(2)

    if status == "SUCCEEDED":
        output = desc.get("output")
        print("Execution output:", output)
    else:
        print(f"Execution ended with status: {status}")    

In [35]:
import time

execution_arn = response["executionArn"]

while True:
    desc = sfn.describe_execution(executionArn=execution_arn)
    status = desc["status"]
    print(f"Execution status: {status}")
    if status in ("SUCCEEDED", "FAILED", "TIMED_OUT", "ABORTED"):
        break
    time.sleep(2)

if status == "SUCCEEDED":
    output = desc.get("output")
    print("Execution output:", output)
else:
    print(f"Execution ended with status: {status}")

Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"8060ff6c-a4e8-4ac4-8b80-a8986bb998ea","document_id":"068ec5c0-f1df-4733-930a-ac8c6e0390aa"}},{"Payload":{"processed":"eff3ec42-9c55-43c1-8e2c-bfd264fa5e7b","document_id":"068ec5c0-f1df-4733-930a-ac8c6e0390aa"}},{"Payload":{"processed":"3434c261-0fba-4a6b-a2b7-d8daa760f286","document_id":"068ec5c0-f1df-4733-930a-ac8c6e0390aa"}},{"Payload":{"processed":"cbb26c05-c514-4eb0-b09d-6800b7975fff","document_id":"068ec5c0-f1df-4733-930a-ac8c6e0390aa"}},{"Payload":{"processed":"28982ce4-defb-4f37-991f-a2058080ccbe","document_id":"068ec5c0-f1df-4733-930a-ac8c6e0390aa"}},{"Payload":{"processed":"34fad79f-9e28-4a5e-821b-cf04b48e0091","document_id":"068ec5c0-f1df-4733-930a-ac8c6e0390aa"}},{"Payload":{"processed":"1d756736

In [5]:
jm = JargonManagement()
dm = DocumentManagement()

In [6]:
jargons = jm.read_jargons(session=Depends(get_session))

In [13]:
from package.llm.ollama import BedrockOllamaChat
keyword = "FreshWiki"
evidences = [jargon.evidence for jargon in jargons if keyword.lower() in jargon.jargon.lower()]
jargon_evidence = "\n".join(evidences)
# query = f"What does {keyword} stand for?"
query = f"What is {keyword}?"
system_prompt = "Only edit QUERY based on a provided EVIDENCE."
model = BedrockOllamaChat()

messages = [model.UserMessage(text=f"EVIDENCE:\n\n{jargon_evidence}\n\nQUERY:\n\n{query}\n\n")]

with_jargon = model.run(
    system_prompt=system_prompt,
    messages=messages
)
print(with_jargon)

Based on the provided EVIDENCE, I can update the QUERY as follows:

What is the FreshWiki dataset?


In [15]:
evidences

['we randomly select 100 samples from the FreshWiki dataset',
 'We curate the FreshWiki dataset']

In [14]:
vector = embedder.run(sentences=[query])[0]
longterms = ltm.read_similar_text(vector, embed_method="raw", session=Depends(get_session))
_contexts = "\n".join([longterm.combo for longterm in longterms])
context_query = f"CONTEXT:\n\n{_contexts}\n\nQUERY:\n\n{query}\n\n"
rag_model = model.run(
    system_prompt="You are a helpful assistant.",
    messages=[model.UserMessage(text=context_query)]
)
print(rag_model)

FreshWiki is a dataset created by selecting recent Wikipedia articles (from February 2022 to September 2023) with high edit counts, B-class quality or above, and excluding list articles and those without subsections. It is used as a reference for research in generating Wikipedia-like articles from scratch, focusing on the pre-writing stage.


In [9]:
bare_model = model.run(
    system_prompt="You are a helpful assistant.",
    messages=[model.UserMessage(text=query)]
)
print(bare_model)

I couldn't find any information on "FreshWiki." It's possible that it's a lesser-known or outdated platform, or it could be a misspelling or variation of a different term.

However, I can suggest some alternatives that might be related:

1. Freshdesk: Freshdesk is a cloud-based customer support software that provides a range of features for managing customer interactions, including ticketing, chat, and knowledge base management.
2. FreshBooks: FreshBooks is an accounting and invoicing software designed for small businesses and freelancers. It provides features for tracking time, creating invoices, and managing expenses.
3. Wiki: A wiki is a type of collaborative online platform that allows users to create, edit, and share content. The most well-known example is Wikipedia, a free online encyclopedia.

If you have any more information or context about FreshWiki, I may be able to help you better.


In [38]:
len(jm.read_jargons(session=Depends(get_session)))

218

In [5]:
dm = DocumentManagement()
documents = dm.read_documents(session=Depends(get_session))
document1 = documents[0]

In [7]:
ltm = LongTermManagement()
_longterms = ltm.read_longterms_by_document(document_id=document1.id, session=Depends(get_session))
len(_longterms)

87

In [8]:
from package.databases.utils import now_utc
def embed(longterms, embedder, session: Session = Depends(get_session)):
    updated_at = now_utc()
    raws = [longterm.raw for longterm in longterms]
    raw_vectors = embedder.run(sentences=raws)
    enrichs = [longterm.enrich for longterm in longterms]
    enrich_vectors = embedder.run(sentences=enrichs)
    combos = [longterm.combo for longterm in longterms]
    combo_vectors = embedder.run(sentences=combos)
    for longterm, vector in zip(longterms, raw_vectors):
        longterm.raw_embedding = vector
    for longterm, vector in zip(longterms, enrich_vectors):
        longterm.enrich_embedding = vector
    for longterm, vector in zip(longterms, combo_vectors):
        longterm.combo_embedding = vector
        longterm.updated_at = updated_at
    ltm.update_longterms(longterms=longterms, session=session)

In [9]:
from package.embedding.baai import BAAIEmbedding
embedder = BAAIEmbedding()
embed(_longterms, embedder=embedder, session=Depends(get_session))

🔍 Loading model from: BAAI/bge-m3


Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
ltm.read_longterms_by_document(document_id=document1.id, session=Depends(get_session))[0]

LongTerm(id='28982ce4-defb-4f37-991f-a2058080ccbe', enrich='The STORM paradigm is proposed to improve the pre-writing stage of long-form article generation by leveraging external sources, multi-perspective question asking, and iterative research to create a high-quality outline that can be expanded into a full-length article.', combo="The STORM paradigm is proposed to improve the pre-writing stage of long-form article generation by leveraging external sources, multi-perspective question asking, and iterative research to create a high-quality outline that can be expanded into a full-length article.\n\noutlines or even entire articles ( Direct Gen ). However, this approach is limited by a lack of details and hallucinations (Xu et al., 2023), particularly in addressing long-tail topics (Kandpal et al., 2023). This underscores the importance of leveraging external sources, and current strategies often involve retrieval-augmented generation ( RAG ), which circles back to the problem of rese

In [33]:
vector = embedder.run(sentences=["How was FreshWiki created?"])[0]
response = ltm.read_similar_text(vector=vector, embed_method="raw", session=Depends(get_session))
for r in response:
    print(r.meta)

{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2 FreshWiki', 'sequence': 5}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2.1 The FreshWiki Dataset', 'sequence': 7}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'A Dataset Details', 'sequence': 40}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '8 Conclusion', 'sequence': 29}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'Abstract', 'sequence': 2}


In [34]:
# vector = embedder.run(sentences=["How was FreshWiki created?"])[0]
response = ltm.read_similar_text(vector=vector, embed_method="enrich", session=Depends(get_session))
for r in response:
    print(r.meta)

{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2.1 The FreshWiki Dataset', 'sequence': 7}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'A Dataset Details', 'sequence': 40}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '4 Experiments', 'sequence': 14}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '## Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models', 'sequence': 0}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2 FreshWiki', 'sequence': 5}


In [35]:
# vector = embedder.run(sentences=["How was FreshWiki created?"])[0]
response = ltm.read_similar_text(vector=vector, embed_method="combo", session=Depends(get_session))
for r in response:
    print(r.meta)

{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2.1 The FreshWiki Dataset', 'sequence': 7}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': 'A Dataset Details', 'sequence': 40}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '2 FreshWiki', 'sequence': 5}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '8 Conclusion', 'sequence': 29}
{'type': 'pdf', 'source': './sources/storm.pdf', 'section': '4 Experiments', 'sequence': 14}


In [7]:
from package.databases.engine import engine
from sqlalchemy import select, func, or_
from sqlmodel import Session

def search_jargon(term: str):
    ts_query = func.to_tsquery("english", term)
    with Session(engine) as session:
        stmt = select(Jargon).where(
            or_(
                Jargon.search_vector.op('@@')(ts_query),
                func.similarity(Jargon.jargon, term) > 0.3
                
            )
            )
        results = session.exec(stmt).all()
        return results


results = search_jargon("freshwiki")
len(results)

2

In [8]:
results

[(Jargon(explanation='FreshWiki is a dataset used for the experiment', id='bbf62a78-22cc-4c9b-9e59-25396c54edfe', meta={'type': 'pdf', 'source': './sour ... (205 characters truncated) ... Wiki', document_id='068ec5c0-f1df-4733-930a-ac8c6e0390aa', created_at=datetime.datetime(2025, 7, 2, 13, 27, 3, 148137), search_vector="'freshwiki':1"),),
 (Jargon(explanation='FreshWiki is a dataset for studying the generation of grounded longform articles', id='7817c938-ba70-449b-8517-d2c45e146d42', meta ... (212 characters truncated) ... Wiki', document_id='068ec5c0-f1df-4733-930a-ac8c6e0390aa', created_at=datetime.datetime(2025, 7, 2, 13, 27, 4, 141963), search_vector="'freshwiki':1"),)]

In [71]:
results[5][0].evidence

'We propose STORM , a writing system for the S ynthesis of T opic O utlines through R etrieval and M ulti-perspective Question Asking'

In [72]:
texts = [result[0].evidence for result in results]

In [75]:
print("\n".join(texts))

STORM simulates a conversation between a Wikipedia writer and a topic expert
STORM is capable of researching complicated topics and writing long articles from detailed outlines
STORM creates an outline before the actual writing starts
Given the input topic t , STORM discovers different perspectives by surveying existing articles from similar topics
We present STORM to automate the pre-writing stage
We propose STORM , a writing system for the S ynthesis of T opic O utlines through R etrieval and M ulti-perspective Question Asking
Algorithm 1: STORM
we introduce STORM, a framework that automates the pre-writing stage
articles produced by STORM
STORM prompts LLMs to ask effective questions by discovering specific perspectives and simulating multi-turn conversations
we propose the STORM paradigm for the S ynthesis of T opic O utlines through R etrieval and M ulti-perspective Question Asking
STORM             | 45.82
4.4 STORM Implementation
STORM
articles produced by STORM
We propose STORM

In [19]:
from package.agents.jargon_extractor import JargonExtractor

je = JargonExtractor()

In [25]:
batch = 10

jargons = []

for b in range(batch):
    context = longterms[b].raw
    j = je.run(context=context)
    jargons.extend(j)



In [29]:
keyword = "rag"

[jargon for jargon in jargons if keyword in jargon.jargon.lower()]

[Jargon(jargon='RAG', evidence='current strategies often involve retrieval-augmented generation (RAG)', explanation='RAG is short for retrieval-augmented generation'),
 Jargon(jargon='RAG', evidence='retrieval-augmented generation ( RAG )', explanation='RAG is short for retrieval-augmented generation')]

In [26]:
from package.databases.destroy import drop_all_tables

drop_all_tables()

✅ All tables dropped.
