In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from package.databases.initialize import initialize_memories
initialize_memories()

In [3]:
from package.utils.data_loder import PDFLoader
from package.interface import SourceOptions
from package.flows.offline import OfflineFlow
from package.databases.management.longterm import LongTermManagement
from package.databases.management.user import UserManagement
from package.databases.management.document import DocumentManagement
from package.databases.management.project import ProjectManagement
# from package.databases.management.jargon import JargonManagement
from package.databases.session import Session, get_session, Depends
from package.databases.models.user import User
from package.databases.models.document import Document
from package.databases.models.project import Project
# from package.databases.models.jargon import Jargon
from package.databases.models.longterm import LongTerm
from package.embedding.baai import BAAIEmbedding
from package.databases.management.term import TermManagement

embedder = BAAIEmbedding()
ltm = LongTermManagement()
dm = DocumentManagement()
# jm = JargonManagement()
um = UserManagement()
pm = ProjectManagement()
tm = TermManagement()

  from .autonotebook import tqdm as notebook_tqdm


🔍 Loading model from: BAAI/bge-m3


Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]


In [4]:
user = User(
    username="bank",
    password="555",
    email="bank@bank.com"
)

um = UserManagement()
user = um.create_user(user, session=Depends(get_session))

In [5]:
user.id

'f7bfaa2c-9e01-4cad-8dc4-7c740caf4a9f'

In [8]:
import boto3
import json
import uuid
import os
import time
from dotenv import load_dotenv
load_dotenv()
STATE_MACHINE_ARN = os.getenv("STATE_MACHINE_ARN")

def run_stepfunctions(stateMachineArn, document_chunks):

    # Replace this with your actual Step Function ARN

    # Create a unique name for this execution
    execution_name = f"test-execution-{uuid.uuid4()}"

    # Input payload
    input_payload = {
        "document_chunks": document_chunks
    }

    # Create Step Functions client
    sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")

    # Start execution
    response = sfn.start_execution(
        stateMachineArn=stateMachineArn,
        name=execution_name,
        input=json.dumps(input_payload)
    )
    return response

def check_stepfunctions_job(executionArn):
    sfn = boto3.client("stepfunctions", region_name="ap-southeast-1")
    while True:
        desc = sfn.describe_execution(executionArn=executionArn)
        status = desc["status"]
        print(f"Execution status: {status}")
        if status in ("SUCCEEDED", "FAILED", "TIMED_OUT", "ABORTED"):
            break
        time.sleep(2)

    if status == "SUCCEEDED":
        output = desc.get("output")
        print("Execution output:", output)
    else:
        print(f"Execution ended with status: {status}")    

In [9]:
from package.databases.utils import now_utc
def embed(longterms, embedder, session: Session = Depends(get_session)):
    updated_at = now_utc()
    raws = [longterm.raw for longterm in longterms]
    raw_vectors = embedder.run(sentences=raws)
    enrichs = [longterm.enrich for longterm in longterms]
    enrich_vectors = embedder.run(sentences=enrichs)
    combos = [longterm.combo for longterm in longterms]
    combo_vectors = embedder.run(sentences=combos)
    for longterm, vector in zip(longterms, raw_vectors):
        longterm.raw_embedding = vector
    for longterm, vector in zip(longterms, enrich_vectors):
        longterm.enrich_embedding = vector
    for longterm, vector in zip(longterms, combo_vectors):
        longterm.combo_embedding = vector
        longterm.updated_at = updated_at
    ltm.update_longterms(longterms=longterms, session=session)

In [10]:
from pathlib import Path
from tqdm import tqdm

source_dir = Path("./sources")
pdf_files = source_dir.glob("*.pdf")
olf = OfflineFlow()
errors = []

# target = 'Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models.pdf'
# target = 'AcuRank Uncertainty-Aware Adaptive Computation for Listwise Reranking.pdf'

for pdf_path in tqdm(pdf_files):
    source = pdf_path.name
    # if source == target:
    #     continue  # skip until we reach the target file

    source_type = pdf_path.suffix.lstrip(".")
    source_ops = SourceOptions(
        path=str(pdf_path),
        type=source_type
    )
    document = Document(source=source, type=source_type)
    document = dm.create_document(document, session=Depends(get_session))
    try:
        print(source)
        loader = PDFLoader(source=source_ops)
        contexts = loader.run()


        longterms = olf.run(document_id=document.id, contexts=contexts)
        ltm.create_raws(longterms, session=Depends(get_session))

        longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
        document_chunks = [{"document_id": document.id, "longterm_id": longterm.id} for longterm in longterms]

        response = run_stepfunctions(stateMachineArn=STATE_MACHINE_ARN, document_chunks=document_chunks)
        executionArn = response["executionArn"]
        check_stepfunctions_job(executionArn)

        longterms = ltm.read_longterms_by_document(document_id=document.id, session=Depends(get_session))
        embed(longterms, embedder=embedder, session=Depends(get_session))

    except Exception as e:
        errors.append((document.id, source, str(e)))

    # break  # ✅ stop the loop after processing the matched file


0it [00:00, ?it/s]

AcuRank Uncertainty-Aware Adaptive Computation for Listwise Reranking.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"520ee2e0-935a-4e34-bbf4-22413a6c0713","document_id":"af617836-a979-448c-bfcf-7d6385fd12a1"}},{"Payload":{"processed":"767c1125-8733-41e6-b39e-b47ad25bd317","document_id":"af617836-a979-448c-bfcf-7d6385fd12a1"}},{"Payload":{"processed":"ad9dfe6d-b76d-48a1-a7ee-35db37935df6","document_id":"af617836-a9

pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 20.17it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s]
pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 95.09it/s]
Inference Embeddings: 100%|██████████| 2/2 [00:00<00:00,  3.47it/s]
pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 22.34it/s]
Inference Embeddings: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
1it [01:32, 92.66s/it]

Assisting in Writing Wikipedia-like Articles From Scratch with Large Language Models.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"a8fbf264-d49d-4d60-9275-2973b44f1925","document_id":"06ad7bfc-20e0-40fb-a806-66c8f871e96a"}},{"Payload":{"processed":"573ecdf7-2ef4-49ba-998f-89656b3a7168","document_id":"06ad7bfc-20e0-40fb-a806-66c8f871e96a"}},{"Payload":{"processed":"ad98c1d5-b939-4273-a7da-d8e37d524543","document_id":"06ad7bfc-20e0-40fb-a806-66c8f871e96a"}},{"Payload":{"processed":"4cf9c34c-32c6-4054-a710-a86423022e8a","document_id":"06ad7bfc-20e0-40fb-a806-66c8f871e96a"}},{"Payload":{"processed":"da606404-f599-432d-ab7e-299f1221d3e4","document_id":"06ad7bfc-20e0-40fb-a806-66c8f871e96a"}},{"Payload":{"processed":"2446a323-6a0c-4e62-9417-bf3e14d0b92b","document_

2it [02:37, 76.02s/it]

ClueAnchor Clue-Anchored Knowledge Reasoning Exploration and Optimization for Retrieval-Augmented Generation.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"85d4608d-e8f4-40ce-bfca-f9b87ff8a7ad","document_id":"44316211-6f8d-463a-ab44-fad555690d5c"}},{"Payload":{"processed":"f7d4664c-5c46-4a73-b1cf-2229b0273108","document_id":"44316211-6f8d-463a-ab44-fad555690d5c"}},{"Payload":{"processed":"9e17a93d-c1e2-4cbd-8d10-c59a90fb2585","document_id":"44316211-6f8d-463a-ab44-fad555690d5c"}},{"Payload":{"processed":"c5477d5a-80ee-477f-b866-c510298c81d4","document_id":"44316211-6f8d-463a-ab44-fad555690d5c"}},{"Payload":{"processed":"ac1cede9-3

3it [04:10, 84.05s/it]

DiscoVLA Discrepancy Reduction in Vision, Language, and Alignment for Parameter-Efficient Video-Text Retrieval.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"437094d8-81b8-446f-b758-afd3f715c31e","document_id":"4cc494d9-fe5e-41b9-9b27-850b85a142bd"}},{"Payload":{"processed":"1dbf95e5-849f-4e16-8bb5-53923a4dfcea","document_id":"4cc494d9-fe5e-41b9-9b27-850b85a142bd"}},{"Payload":{"processed":"b58080c8-b07a-4463-80d9-704a78ed2967","document_id":"4cc494d9-fe5e-41b9-9b27-850b85a142bd"}},{"Payload":{"processed":"109280c8-6bb2-4326-921d-2f8535fc83dc","document_id":"4cc494d9-fe5e-41b9-9b27-850b85a142bd"}},{"Payload":{"processed":"867497f8-fb9d-40f7-9f33-10f0f74e540a","document_id":"4cc494d9-fe5e-41b9-9b27-850b85a142bd"}},{"Payload":{"processed":"4f09230f-f70d-41e1-9b67-f5479790df3c","document_id":"4cc494d9-fe5e-41b9-9b27-850b85a142bd"}},{"Payload":{"processed":"3357ac29

4it [04:51, 66.91s/it]

EXP4FUSE A RANK FUSION FRAMEWORK FOR ENHANCED SPARSE RETRIEVAL USING LARGE LANGUAGE MODEL-BASED QUERY EXPANSION.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"53ee620e-fb8c-491a-af28-a224c6461dd0","document_id":"98c7c09d-3cd0-43b5-9167-c1341a3e3b16"}},{"Payload":{"processed":"bc6682b1-c197-4292-8b68-739ea98eaff7","document_id":"98c7c09d-3cd0-43b5-9167-c1341a3e3b16"}},{"Payload":{"processed":"e9f1a515-308f-4bd6-998e-6b2bde9c0ec7","document_id":"98c7c09d-3cd0-43b5-9167-c1341a3e3b16"}},{"Payload":{"processed":"fb6f4327-193a-4107-8ddf-48b384b7bda5","document_id":"98c7c09d-3cd0-43b5-9167-c1341a3e3b16"}},{"Payload":{"processed":"8b90d490-4757-4ae2-a592-05a3e0114e8c","document_id":"98c7c09d-3cd0-43b5-9167-c1341a3e3b16"}},{"Payload":{"processed":"1b43054f-98e9-4669-82a

5it [05:33, 57.88s/it]

GainRAG Preference Alignment in Retrieval-Augmented Generation through Gain Signal Synthesis.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"35666a80-d5d2-4b77-add7-145d5b927026","document_id":"52034070-d130-4886-bb30-3c3125c961cb"}},{"Payload":{"processed":"e9119195-8a01-4aa7-a7f4-94aa53905c93","document_id":"52034070-d130-4886-bb30-3c3125c961cb"}},{"Payload":{"processed":"6c10c570-d586-4f70-b811-7c568a39b09f","document_id":"52034070-d130-4886-bb30-3c3125c961cb"}},{"Payload":{"processed":"f0207a8d-5281-4872-b371-417d5085c9ae","document_id":"52034070-d130-4886-bb30-3c3125c961cb"}},{"Payload":{"processed":"6993fc7d-e0fe-463b-b32d-561a7e1cec8c","document_id":"52034070-d130-4886-bb30-3c3125c961cb"}},{"Payload":{"processed":"3d3fa0e3-c234-4b9e-9646-ca94a7f70a8d","document_id":"52034070-d130-4886-bb30-3c3125c961cb"}},{"Payload":{"processed":"c3f25af5

6it [06:09, 50.50s/it]

GenKI Enhancing Open-Domain Question Answering with Knowledge Integration and Controllable Generation in Large Language Models.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"d64229be-0304-4129-b433-be11a723397b","document_id":"d402aeb9-3e82-4c2a-a9ba-f5bdda533b58"}},{"Payload":{"processed":"72524756-291f-485e-8aff-ed40f319296c","document_id":"d402aeb9-3e82-4c2a-a9ba-f5bdda533b58"}},{"Payload":{"processed":"66cc66fa-ed30-44b7-ad3b-21f4183a5fb6","document_id":"d402aeb9-3e82-4c2a-a9ba-f5bdda533b58"}},{"Payload":{"processed":"6eec1d07-852c-4ddb-8cbf-bb208f1037fa","document_id":"d402aeb9-3e82-4c2a-a9ba-f5bdda533b58"}},{"Payload":{"processed":"1c32e42f-cf67-4090-87a8-49e4580191bf","document_id":"d402aeb9-3e82-4c2a-a9ba-f5bdda533b58"}},{"Payload":{"processed":"33e65f7d-604a-43ae-ae97-69619da05572","document_id":"d402aeb9-3e82-4c2a-a9

7it [07:01, 51.13s/it]

HippoRAG Neurobiologically Inspired Long-Term Memory for Large Language Models.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"bc2db7fd-fb31-46ec-a169-de9b18d71bf4","document_id":"95517b4d-6753-415d-93b2-e31e21c26df2"}},{"Payload":{"processed":"f2480a9e-65e8-46f2-ae3c-b8a6a9764b43","document_id":"95517b4d-6753-415d-93b2-e31e21c26df2"}},{"Payload":{"processed":"bdfac23f-ee1b-4221-896a-d4cf8b24ea50","document_id":"95517b4d-6753-415d-93b2-e31e21c26df2"}},{"Payload":{"processed":"aaeb84bb-d265-40e6-881b-0512eb6918a7","document_id":"95517b4d-6753-415d-93b2-e31e21c26df2"}},{"Payload":{"processed":"54fd12be-58a9-4d41-80ee-9e7166cdd9d7","document_id":"95517b4d-6753-415d-93b2-e31e21c26df

8it [08:32, 63.85s/it]

LlamaRec-LKG-RAG A Single-Pass, Learnable Knowledge Graph-RAG Framework for LLM-Based Ranking.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(3)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"1e22c7a5-ce62-41fd-8976-b45b488faf31","document_id":"24f33744-2836-4c57-8108-465065b1f793"}},{"Payload":{"processed":"d0c16bd1-41b6-476c-af08-0b941b145f38","document_id":"24f33744-2836-4c57-8108-465065b1f793"}},{"Payload":{"processed":"2060c4fb-3f32-4831-a7b5-c63f39f0a3e4","document_id":"24f33744-2836-4c57-8108-465065b1f793"}},{"Payload":{"processed":"e404f3a7-d426-4c4b-b421-bcdac12008e1","document_id":"24f33744-2836-4c57-8108-465065b1f793"}},{"Payload":{"processed":"48e59167-1f1f-4a47-81d6-7232a4a2126f","document_id":"24f33744-2836-4c57-8108-465065b1f793"}},{"Payload":{"processed":"d8fc49c5-ddf1-48b6-b39d-a8743aad700b","document_id":"24f33744-2836-4c57-8108-465065b1f793"}},{"Payload":{"processed":"29eb209f

9it [09:20, 58.80s/it]

LOGICOL Logically-Informed Contrastive Learning for Set-based Dense Retrieval.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"f1dd17cc-d5f2-4e96-a76b-3ca5012eaca8","document_id":"f38e0305-f464-4954-9fb4-809765384259"}},{"Payload":{"processed":"1cf6e237-00d1-4833-aadc-874fc12fc88e","document_id":"f38e0305-f464-4954-9fb4-809765384259"}},{"Payload":{"processed":"9292797e-0656-48dd-9415-42dab93a36a8","document_id":"f38e0305-f464-4954-9fb4-809765384259"}},{"Payload":{"processed":"0a2b72c9-8812-4467-84e3-e5ccb8c74683","document_id":"f38e0305-f464-4954-9fb4-809765384259"}},{"Payload":{"processed":"fc13d02e-c8bd-4c84-a191-1c7e3a3b6040","document_id":"f38e0305-f464-4954-9fb4-809765384259"}},{"Payload":{"processed":"3a264e2d-7cf1-4c63-bcd0-ec25535a676c","document_id":"f38e0305-f464-4954-9fb4-809765384259"}},{"Payload":{"processed":"4f3d4cda

10it [09:53, 50.96s/it]

MASKSEARCH A Universal Pre-Training Framework to Enhance Agentic Search Capability.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"f9dd779f-f1c4-4b43-9c03-9abc167a68e7","document_id":"fc0a57fd-6c7e-464d-b819-5e2c31a84ef4"}},{"Payload":{"processed":"f637c731-ae3a-4997-8047-00b42f30bbb9","document_id":"fc0a57fd-6c7e-464d-b819-5e2c31a84ef4"}},{"Payload":{"processed":"eb8fa7b9-464a-49aa-a915-7f760d824ab8","document_id":"fc0a57fd-6c7e-464d-b819-5e2c31a84ef4"}},{"Payload":{"processed":"efd80655-97cd-4d87-b2b9-9271067ca84a","document_id":"fc0a57fd-6c7e-464d-b819-5e2c31a84ef4"}},{"Payload":{"processed":"3e7645aa-204c-4978-b10f-d349f8c0700d","document_id":"fc0a57fd-6c7e-464d-b819-5e2c31a84ef4"}},{"Payload":{"processed":"c16f169f-ac00-427c-b660-9d535143bb1c","document_

11it [11:05, 57.37s/it]

PAKTON A Multi-Agent Framework for Question Answering in Long Legal Agreements.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"67742a9e-bf16-44e1-9254-71a647aefe45","document_id":"b3bc7aa5-d5b9-4d20-8e35-9a208a8e8b0e"}},{"Payload":{"processed":"bb610827-1a60-4ee7-8be8-38d9bde92f96","document_id":"b3bc7aa5-d5b9-4d20-8e35-9a208a8e8b0e"}},{"Payload":{"processed":"0e32ca29-84b4-465c-ade5-c264918389dd","document_id":"b3bc7aa5-d5b9-4d20-8e35-9a208a8e8b0e"}},{"Payload":{"processed":"b023e221-bf6a-493e-92a2-9b7924db8c2a","document_id":"b3bc7aa5-d5b9-4d20-8e35-9a208a8e8b0e"}},{"Payload":{"processed":"3bbf4783-25b9-4649-8172-41c58da9d34a","document_id":"b3bc7aa5-

12it [12:54, 72.87s/it]

POQD Performance-Oriented Query Decomposer for Multi-vector retrieval.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"38e666e2-e462-46ca-bcdc-12133785d0f3","document_id":"18342d7e-0d2f-4b74-a1d9-0caadffddddf"}},{"Payload":{"processed":"71cf3397-e9dd-4d5b-8f56-dbe0b94e71bc","document_id":"18342d7e-0d2f-4b74-a1d9-0caadffddddf"}},{"Payload":{"processed":"aacd1624-9090-411c-9639-5a362cca4c06","document_id":"18342d7e-0d2f-4b74-a1d9-0caadffddddf"}},{"Payload":{"processed":"02bb3d24-3d7c-4b5a-8c4c-b39447daed80","document_id":"18342d7e-0d2f-4b74-a1d9-0caadffddddf"}},{"Payload":{"processed":"0b098d53-5811-4455-b070-169a32cac1ab","document_id":"18342d7e-0d2f-4b74-a1d9-0caadffddddf"}},{"Payload":{"processed":"5939de98-f5a2-428f-941a-ce92ccbda73e","document_id":"18342d7e-0d2f-4b74-a1d9-0caadffddddf"}},{"Paylo

13it [13:45, 66.23s/it]

RARE Retrieval-Aware Robustness Evaluation for Retrieval-Augmented Generation Systems.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(3)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"9800a940-69ee-41df-b915-1d9f3af9ee14","document_id":"3884cc52-33bd-43a2-aa39-8747a1b74257"}},{"Payload":{"processed":"1ac83152-88ed-4e6b-86a8-41a8b6dc8cfa","document_id":"3884cc52-33bd-43a2-aa39-8747a1b74257"}},{"Payload":{"processed":"3b88d513-d870-4489-b362-5acac841b986","document_id":"3884cc52-33bd-43a2-aa39-8747a1b74257"}},{"Payload":{"processed":"ca1528a3-0826-4015-82df-d2aa34d4d1d9","document_id":"3884cc52-33bd-43a2-aa39-8747a1b74257"}},{"Payload":{"processed":"5039ebde-5b5b-4e2e-bc62-7b1490c33592","document_id":"3884cc52-33bd-43a2-aa39-8747a1b74257"}},{"Payload":{"processed":"c439a73a-3782-453c-a508-840febfa0d45","document_

14it [14:39, 62.79s/it]

REARANK Reasoning Re-ranking Agent via Reinforcement Learning.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"c700541c-162c-4150-b779-725f736396a7","document_id":"e809f8ae-921d-4130-bafd-f074319b3ed6"}},{"Payload":{"processed":"a86ee887-5c52-491e-9af4-dc41add1ca03","document_id":"e809f8ae-921d-4130-bafd-f074319b3ed6"}},{"Payload":{"processed":"6277c578-9606-43ae-8db3-ebaaac690382","document_id":"e809f8ae-921d-4130-bafd-f074319b3ed6"}},{"Payload":{"processed":"7e549f3a-0fbd-44bb-ae2e-5f871821aa98","document_id":"e809f8ae-921d-4130-bafd-f074319b3ed6"}},{"Payload":{"processed":"53d407ac-6ca9-41d6-8351-b22edde1d4b7","document_id":"e809f8ae-921d-4130-bafd-f074319b3ed6"}},{"Payload":{"processed":"03e5c30c-6810-465e-8d95-a1e3f22d8e67","document_

15it [15:46, 63.82s/it]

SlideCoder Layout-aware RAG-enhanced Hierarchical Slide Generation from Design.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"86a4448f-6508-46fc-915d-90310e47e000","document_id":"cdbe2918-c993-42ff-9354-eb4ffc266de0"}},{"Payload":{"processed":"3f95384b-2f63-487d-addc-fc184a242ab4","document_id":"cdbe2918-c993-42ff-9354-eb4ffc266de0"}},{"Payload":{"processed":"3cb7e24e-f1f1-415c-ab0c-2e7d2eb4f2f9","document_id":"cdbe2918-c993-42ff-9354-eb4ffc266de0"}},{"Payload":{"processed":"4e0f1482-18b7-4db4-8081-79565d557e09","document_id":"cdbe2918-c993-42ff-9354-eb4ffc266de0"}},{"Payload":{"processed":"d23bfc23-7239-43dc-bb65-e1a0ea672ebf","document_id":"cdbe2918-c993-42ff-9354-eb4ffc266de0"}},{"Payload":{"processed":"98c02ca4-2737-423d-b907-ab30e66f6f71","document_id":"cdbe2918-c993-42ff-93

16it [16:35, 59.38s/it]

SORCE Small Object Retrieval in Complex Environments.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"88f2dbb4-2b1f-4f7d-ba0c-e2197510bca1","document_id":"1736cec1-9901-4634-ac31-2bac14137406"}},{"Payload":{"processed":"218683f5-f616-4690-82b1-390d52d8d76b","document_id":"1736cec1-9901-4634-ac31-2bac14137406"}},{"Payload":{"processed":"304f299d-923a-4f78-bacf-3124cff0e663","document_id":"1736cec1-9901-4634-ac31-2bac14137406"}},{"Payload":{"processed":"416f8deb-d8f1-4370-9aa6-5694591405de","document_id":"1736cec1-9901-4634-ac31-2bac14137406"}},{"Payload":{"processed":"9e25f7eb-040b-43d5-910a-37d37363bb95","document_id":"1736cec1-9901-4634-ac31-2bac14137406"}},{"Payload":{"processed":"dda5b231-84f2-4153-a391-6930e9d201fe","document_id":"1736cec1-9901-4634-ac31-2bac14137406"}},{"Paylo

17it [17:19, 54.82s/it]

TracLLM A Generic Framework for Attributing Long Context LLMs.pdf


  chunks = split_markdown(text)
  consolidated_chunks = consolidate_markdown(chunks)
  sections = get_markdown_sections(consolidated_chunks)
  new_contexts = split_overlap(contexts, max_tokens=max_tokens, overlap=overlap)


Markdown headings: max(2)
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: RUNNING
Execution status: SUCCEEDED
Execution output: {"status":"success","results":[{"Payload":{"processed":"b2330f8d-4abe-4fd7-99da-0a79d2a5c1aa","document_id":"ed281e35-924a-45e9-9fa8-d4238213437d"}},{"Payload":{"processed":"fa13257b-f14b-44a5-a65d-dbb3474ba725","document_id":"ed281e35-924a-45e9-9fa8-d4238213437d"}},{"Payload":{"processed":"a061661f-2365-4936-9388-6171775711bb","document_id":"ed281e35-924a-45e9-9fa8-d4238213437d"}},{"Payload":{"processed":"f33abbc1-02df-4990-9af4-6a5f9543f632","document_id":"ed281e35-924a-45e9-9fa8-d4238213437d"}},{"Payload":{"processed":"bc97e5d4-645f-4bf7-b3c6-570b3d7a6404","document_id":"ed281e35-924a-45e9-9fa8-d4238213437d"}},{"Payload":{"processed":"38757ba9-aeab-4d41-9137-b118d64a13a9","document_

18it [18:20, 61.13s/it]


In [12]:
errors

[]