In [None]:
# Load IPython's autoreload extension and select mode 2, presumably so that
# edits to the imported project modules are picked up without restarting the
# kernel (see the IPython autoreload docs to confirm the mode semantics).
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

# Load variables from a .env file into the environment (presumably the API
# credentials that the LLM calls in this notebook rely on -- confirm).
load_dotenv()

In [None]:
from pyprojroot import here

from poorman_graphrag.index import GraphRAGIndex

# Load the saved GraphRAG index from data/index.json (resolved through
# pyprojroot's `here()`) and show its entities as the cell output.
index_path = here() / "data" / "index.json"
index = GraphRAGIndex.load(index_path)
index.entities

In [None]:
from poorman_graphrag.entities import identify_exact_duplicates

# Display whatever `identify_exact_duplicates` reports for the index's entities
# (the matching rule lives in poorman_graphrag.entities and is not shown here).
identify_exact_duplicates(index.entities)

In [None]:
from poorman_graphrag.entities import identify_levenshtein_similar

# Candidate duplicate groups found by `identify_levenshtein_similar`.
# The result is used as a mapping later on: keys look like
# ("paper", "novick et al. 2012") and each value is a group of entities.
similar_entities = identify_levenshtein_similar(index.entities)

In [None]:
# Inspect the candidate groups before asking the LLM to judge them.
similar_entities

In [None]:
import llamabot as lmb
from pydantic import BaseModel, Field

from poorman_graphrag.entities import Entity


# Structured verdict for one group of candidate duplicates; passed as the
# `pydantic_model` of the `same_entity_judge` StructuredBot defined later.
# Documented with `#` comments (not a class docstring) so the model
# definition itself is left exactly as it was.
class IsSameEntity(BaseModel):
    # True when the judged entities are considered the same entity.
    is_same_entity: bool = Field(
        description="Whether the entities are semantically the same entity"
    )
    # Free-text justification accompanying the verdict.
    reason: str = Field(description="The reason for the answer")

    def __bool__(self) -> bool:
        """Return the verdict so an instance can be used directly in ``if``."""
        return self.is_same_entity


# Prompt template (role "user") listing one group of candidate duplicates, one
# "- <entity_type>: <name>" bullet per entity. The docstring below IS the
# template text handed to `lmb.prompt`, so it must stay exactly as written;
# the documentation therefore lives in these comments.
#
# `entities` is iterated by the template, and every item must expose
# `.entity_type` and `.name`.
#
# The return annotation now matches the sibling `examples` prompt. The
# decorated call hands back a message object (its `.content` is printed
# further down in this notebook), never an `IsSameEntity` instance -- that
# model is only produced by the judge bot.
@lmb.prompt("user")
def is_same_entity(entities) -> str:
    """Here are the entities:

    {% for entity in entities %}- {{ entity.entity_type }}: {{ entity.name }}
    {% endfor %}
    """


# Prompt template (role "user") supplying in-context examples: a bulleted list
# of entities that ARE the same entity, followed by a bulleted list of entities
# that are NOT. The docstring is the template text handed to `lmb.prompt`, so
# it is left untouched and the documentation is kept in these comments.
#
# same_entities: entities rendered under the "same entity" heading.
# different_entities: entities rendered under the "not the same entity" heading.
# Every entity must expose `.entity_type` and `.name`.
@lmb.prompt("user")
def examples(same_entities: list[Entity], different_entities: list[Entity]) -> str:
    """Examples of entities that are the same entity:

    {% for entity in same_entities %}- {{ entity.entity_type }}: {{ entity.name }}
    {% endfor %}

    Examples of entities that are not the same entity:

    {% for entity in different_entities %}- {{ entity.entity_type }}: {{ entity.name }}
    {% endfor %}
    """


# Hand-picked groups used as in-context examples: one group whose members are
# the same entity, and one whose members are not.
same_example_group = similar_entities[("paper", "novick et al. 2012")]
different_example_group = similar_entities[("metric", "type i error")]

# Render the examples prompt and wrap it as a user message.
examples_prompt = examples(
    same_entities=list(same_example_group),
    different_entities=list(different_example_group),
)
in_context_examples = lmb.user(examples_prompt)

In [None]:
# Note: the use of in-context learning here is pretty powerful.


In [None]:
# Preview the rendered prompt text for the first group of similar entities.
first_group = list(similar_entities.values())[0]
preview_message = is_same_entity(first_group)
print(preview_message.content)

In [None]:
# System prompt that frames the model as a duplicate-entity judge.
judge_system_prompt = lmb.system(
    "You are a judge of whether two entities in a knowledge graph "
    "are similar enough to be considered the same entity. "
)

# Bot whose replies are returned as `IsSameEntity` instances.
same_entity_judge = lmb.StructuredBot(
    system_prompt=judge_system_prompt,
    pydantic_model=IsSameEntity,
    model_name="gpt-4o",
)


# Ask the judge about every candidate group and keep only the groups it
# confirms as duplicates. The keys of `similar_entities` appear to be
# (entity_type, name) tuples, hence the `group_key` name.
#
# The hand-picked `in_context_examples` message is sent ahead of each query.
# Previously it was built but never passed to the judge, so the in-context
# examples had no effect on the verdicts.
# NOTE(review): this relies on StructuredBot accepting several messages per
# call -- confirm against the installed llamabot version.
#
# A verdict is truthy only when the judge answers "same entity"
# (see `IsSameEntity.__bool__`).
entity_groups_to_deduplicate = {
    group_key: group
    for group_key, group in similar_entities.items()
    if same_entity_judge(in_context_examples, is_same_entity(group))
}

entity_groups_to_deduplicate

In [None]:
# Merge each confirmed group via the index's own deduplication routine; the
# result is bound to a new name so it can be compared with `index` afterwards.
deduplicated_index = index.deduplicate_entities(entity_groups_to_deduplicate)

In [None]:
# Entity counts after vs. before deduplication.
len(deduplicated_index.entities), len(index.entities)

In [None]:
# Persist the deduplicated index alongside the original index file.
deduplicated_index_path = here() / "data" / "deduplicated_index.json"
deduplicated_index.save(deduplicated_index_path)