In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any
# client library is used.
# NOTE(review): presumably this supplies the API credentials needed by the
# LLM-backed bot created later in the notebook — confirm.
load_dotenv()

In [None]:
import re

from pyprojroot import here

# Read the source markdown document.
# Mode is "r" rather than "r+": nothing is written back, so the cell should not
# require write permission on the file. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(here() / "data" / "curve-sim.md", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Remove every "<!-- image -->" placeholder. The pattern is a plain literal,
# so str.replace does the same job as a regex substitution.
text_without_images = raw_text.replace("<!-- image -->", "")

# Collapse any run of three or more newlines down to exactly two,
# i.e. at most one blank line between paragraphs.
text = re.sub(r"\n{3,}", "\n\n", text_without_images)

In [None]:
from chonkie import SDPMChunker

# Chunking parameters, named so they are easy to find and tune.
CHUNK_MIN_SENTENCES = 10
CHUNK_THRESHOLD = 0.4

chunker = SDPMChunker(
    threshold=CHUNK_THRESHOLD,
    min_sentences=CHUNK_MIN_SENTENCES,
)

# Split the cleaned document and show how many chunks were produced.
chunks = chunker(text)
len(chunks)

In [None]:
from poorman_graphrag.index import GraphRAGIndex

# Create an empty index, register the full document text, then attach every
# chunk to that document using the value returned by add_document (doc_hash).
index = GraphRAGIndex()
doc_hash = index.add_document(text)

for chunk in chunks:
    # Only the chunk's text is passed on; no other chunk attribute is stored here.
    index.add_chunk(doc_hash, chunk.text)

In [None]:
import llamabot as lmb

from poorman_graphrag.relationships import Relationships

# System prompt for the relationship-extraction bot. Kept as a named constant
# so the instructions can be read and edited separately from the bot wiring.
RELATIONSHIP_SYSTEM_PROMPT = """You are an expert at extracting relationships
    between entities in text.
    Given a chunk of text, identify relationships
    between entities mentioned in the text.
    """

# Structured bot whose responses are shaped by the Relationships pydantic model.
bot = lmb.StructuredBot(
    pydantic_model=Relationships,
    system_prompt=RELATIONSHIP_SYSTEM_PROMPT,
)

In [None]:
from tqdm.auto import tqdm

# Run the relationship-extraction bot on every (key, value) entry of
# index.chunk_index and record the result against that entry's key.
# tqdm renders a progress bar while the loop runs.
# NOTE(review): each iteration invokes the bot (presumably one LLM request per
# chunk — confirm), so this cell is likely slow and costly; a failure mid-loop
# leaves the index partially updated. Consider caching or loading a saved index.
for chunk_hash, chunk_text in tqdm(index.chunk_index.items()):
    relationships = bot(chunk_text)
    index.add_relations(chunk_hash, relationships)

In [None]:
# Persist the populated index to data/index.json, located relative to here().
index.save(here() / "data" / "index.json")

In [None]:
# Display how many entries the index's entity_index holds at this point.
len(index.entity_index)