In [None]:
from pathlib import Path

# Sample Markdown document, resolved relative to the parent of the current
# working directory (i.e. <cwd>/../tests/assets/lz.md).
path = Path.cwd().parent / "tests" / "assets" / "lz.md"

# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default locale encoding; read_text also closes the file for us.
text = path.read_text(encoding="utf-8")

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# (markdown prefix, metadata key) pairs handed to the header-based splitter.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# NOTE(review): strip_headers=False presumably keeps the heading lines inside
# each chunk's text -- confirm against the LangChain documentation.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)
chunks = markdown_splitter.split_text(text)

# Dump every chunk under an index banner, with a blank line after each one.
for idx, chunk in enumerate(chunks):
    print("Idx:", idx, '============')
    print(chunk.page_content, end="\n\n")

In [None]:
from langchain_text_splitters import ExperimentalMarkdownSyntaxTextSplitter
# Split the same document with LangChain's experimental syntax-aware splitter.
es = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
cs = es.split_text(text)

# One banner line per chunk, then its text, then a blank separator line.
for idx, chunk in enumerate(cs):
    print(f"{idx} ============", chunk.page_content, "", sep="\n")

In [None]:
from unstructured.partition.md import partition_md
# Partition the Markdown text with unstructured, using its "by_title" chunking
# strategy and a 5000-character cap passed as max_characters.
items = partition_md(
    text=text,
    chunking_strategy="by_title",
    max_characters=5000,
)

# Print each resulting element's text under an index banner.
for idx, chunk in enumerate(items):
    print(idx, '============')
    print(chunk.text, end="\n\n")

## Areas where we should outperform competitors

- Respect Markdown syntax when parsing.
- Can chunk based on Markdown syntax other than headings (e.g. HTML elements inside Markdown).
- Avoid overly fragmented elements (risk of losing context when searching). We can still maintain the hierarchy when combining elements.
- Annotate tables, images, and code blocks if necessary.

In [None]:
from easyparser.split.md import MarkdownSplitByHeading
from easyparser.mime import mime_md

# Build a root chunk for the Markdown file, then overwrite its content with the
# text that was already loaded into `text`.
root = mime_md.as_root_chunk(str(path))
root.content = text

# NOTE(review): min_chunk_size=100 is presumably a lower bound on chunk size --
# confirm its unit and semantics against easyparser.
chunks = MarkdownSplitByHeading.run(root, min_chunk_size=100)

# Show each chunk's metadata on the banner line, followed by its content and a
# blank separator line.
for idx, chunk in enumerate(chunks):
    print(idx, '============', chunk.metadata)
    print(chunk.content, end="\n\n")

In [None]:
# Print the chunk at index 13 of `chunks` (as last assigned by an earlier cell).
inspected_chunk = chunks[13]
print(inspected_chunk)