In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the library source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

# The sample documents sit in tests/assets under the parent of the current
# working directory; long.pdf is the file used throughout this notebook.
asset_folder = Path.cwd().parent.joinpath("tests", "assets")
pdf_file = asset_folder.joinpath("long.pdf")

### Loading and chunking

In [None]:
from easyparser.load.pdf import DoclingPDF
from easyparser.mime import mime_pdf

# Construct the root chunk for the PDF file; later cells inspect it through
# `pdf_node.as_dict()` and persist it with the file store.
pdf_node = mime_pdf.as_root_chunk(pdf_file)

# Run the Docling-based PDF loader on the root chunk to split it into smaller
# chunks. The result supports `len()` and integer indexing (used below).
chunks = DoclingPDF.run(pdf_node)

In [None]:
# Dump the root chunk's dictionary form to see what it carries.
print(pdf_node.as_dict())

In [None]:
# Report how many chunks the loader produced and preview the first few.
# The preview is bounded by the actual chunk count, so the cell no longer
# raises IndexError when the document yields fewer than three chunks.
print(f"There are {len(chunks)} chunks")
print("Example:")
for preview_idx in range(min(3, len(chunks))):
    print(chunks[preview_idx])

### History tracing

Each chunk keeps a `history` of the operations that have been applied to it.

In [None]:
# Show the operation history recorded on the first chunk.
print(chunks[0].history)

In [None]:
from easyparser.chunk.text import ChunkByCharacters

# Re-chunk the loader output by character count, then compare the chunk count
# before and after.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (per the class name) — confirm against the easyparser docs.
chunked_texts = ChunkByCharacters.run(chunks, chunk_size=100, chunk_overlap=20)
print(f"Chunked from {len(chunks)} chunks to {len(chunked_texts)} chunks")

In [None]:
# Find the first re-chunked piece whose history holds more than one entry,
# print its position together with that history, and stop searching.
for position, piece in enumerate(chunked_texts):
    if len(piece.history) <= 1:
        continue
    print(position, piece.history)
    break

### Navigate

Starting from a chunk, it's possible to navigate to the surrounding chunks. This ability will be important when we want to equip an agent with multi-turn retrieval capability.

In [None]:
# Pick one chunk and show how to reach its neighbours. Index 100 is an
# arbitrary choice; it is clamped to the last valid index so the cell does not
# raise IndexError when the document yields 100 or fewer chunks.
c = chunks[min(100, len(chunks) - 1)]

# Each section prints a label, the chunk, and (except for the last section) a
# blank separator line.
print("Random chunk:", c, "", sep="\n")
print("Next chunk:", c.next, "", sep="\n")
print("Prev chunk:", c.prev, "", sep="\n")
print("Parent chunk:", c.parent, sep="\n")

### Persist

`easyparser` supports storing and loading chunks for later use.

In [None]:
from easyparser.store.fs import FileStore

# Create a file-backed store and save both the chunk list and the root chunk.
# NOTE(review): "path" looks like a placeholder location, presumably resolved
# relative to the working directory — confirm and point it at a real folder.
store = FileStore("path")
store.save_group(chunks)
store.save(pdf_node)

#### Loading

In [None]:
# Open a second store object with the same "path" argument that was used when
# saving, so the following cells read chunks back through a fresh instance.
store2 = FileStore("path")

In [None]:
# Fetch a single chunk from the store by its identifier.
# NOTE(review): this ID is hard-coded, presumably copied from an earlier run.
# If chunk IDs are generated fresh on each run, this lookup will not match
# anything saved above — replace it with the ID of a chunk saved in this session.
c = store2.get("6e4d45d14dc84193800e4a5a2b9bf48e")

In [None]:
# Display the text of the chunk fetched from the store.
c.text

In [None]:
# Follow the `next` link three times from the loaded chunk.
# NOTE(review): presumably `next` is None at the end of a sequence, in which
# case this chain would raise AttributeError — confirm for chunks near the end.
print(c.next.next.next)

In [None]:
# Show the parent of the loaded chunk, reached through its `parent` link.
print(c.parent)

In [None]:
# Show the operation history carried by the chunk that was loaded from the store.
print(c.history)