In [None]:
%load_ext autoreload
%autoreload 2

from pyprojroot import here

from poorman_graphrag.index import GraphRAGIndex

# Name the location of the serialized index first (anchored at whatever
# `here()` returns), then load it.
index_path = here() / "data" / "deduplicated_index_with_communities.json"
index = GraphRAGIndex.load(index_path)

In [None]:
import json

import llamabot as lmb
import networkx as nx
from pydantic import BaseModel, Field

from poorman_graphrag.network import build_network


class EnhancedQuery(BaseModel):
    # Structured-output schema: `query_enhancer` passes this class to
    # `lmb.StructuredBot` as `pydantic_model` and returns what the bot produces.
    # NOTE(review): kept as `#` comments rather than a class docstring because
    # pydantic surfaces a model docstring as the schema description, which
    # would presumably change what the LLM is shown -- confirm before converting.

    # Reformulated queries intended for vector (semantic) search.
    queries: list[str] = Field(
        description="A list of strings for which to use vector (semantic) search."
    )
    # Terms intended for keyword search.
    keywords: list[str] = Field(
        description="A list of strings for which to use keyword search."
    )


def query_enhancer(query: str, index: GraphRAGIndex, top_k: int = 10) -> EnhancedQuery:
    """Rewrite a vague user query into sharper search queries and keywords.

    A graph is built from ``index`` and its nodes are ranked by betweenness
    centrality and by degree centrality. The attribute dictionaries of the
    ``top_k`` nodes under each measure (with overlaps removed) are serialized
    to JSON and sent to the LLM together with the original query.

    :param query: The user's original query.
    :param index: The GraphRAG index from which the graph is built.
    :param top_k: Number of top-ranked nodes to take per centrality measure.
    :returns: The bot's structured response, shaped by ``EnhancedQuery``.
    """
    enhancerbot = lmb.StructuredBot(
        system_prompt=(
            "You are a helpful assistant that takes in a user's query, "
            "which may be generic, incomplete, vague, "
            "or otherwise not usefully formulated, "
            "and returns a list of queries "
            "that are more specific, complete, and usefully formulated."
        ),
        pydantic_model=EnhancedQuery,
        model_name="gpt-4o",
    )

    G = build_network(index)

    # Compute the betweenness centrality and degree centrality of the nodes in the graph
    betweenness_centrality = nx.betweenness_centrality(G)
    degree_centrality = nx.degree_centrality(G)

    # Get the top_k node identifiers under each centrality measure.
    top_betweenness = sorted(
        betweenness_centrality, key=betweenness_centrality.get, reverse=True
    )[:top_k]
    top_degree = sorted(degree_centrality, key=degree_centrality.get, reverse=True)[
        :top_k
    ]

    # A node can rank highly under both measures; dict.fromkeys drops the
    # repeats (keeping first-seen order) so its metadata is sent only once.
    top_nodes = list(dict.fromkeys(top_betweenness + top_degree))

    # Now, collect their metadata dictionaries into a long JSON string
    # and pass it to the LLM.
    metadata = [G.nodes[node] for node in top_nodes]
    metadata_str = json.dumps(metadata)

    return enhancerbot(
        lmb.user(
            f"This is the original query: {query}",
            # One sentence, one argument: the three literals below are
            # concatenated, and the count reflects the actual top_k.
            f"These are the top {top_k} nodes by "
            "betweenness centrality and degree centrality, "
            f"and their metadata: {metadata_str}",
        )
    )


# Try the enhancer on a deliberately generic question, leaving top_k at its default.
enhanced_query = query_enhancer(query="Tell me about the study.", index=index)

In [None]:
# Keyword (BM25) store, filled from three sources in turn: the values of
# chunk_index, the values of doc_index, and the summary of every entity.
keyword_docstore = lmb.BM25DocStore()
keyword_sources = (
    index.chunk_index.values(),
    index.doc_index.values(),
    [entity.summary for entity in index.entity_index.values()],
)
for source in keyword_sources:
    keyword_docstore.extend(source)

# Probe the store with a single term and display the length of each hit.
results = keyword_docstore.retrieve("hypothesis")
list(map(len, results))

In [None]:
# Vector store, filled from the same three sources as the keyword store:
# chunk_index values, doc_index values, and every entity's summary.
# NOTE(review): if the named table persists between runs, re-executing this
# cell may append the same documents again -- confirm and reset if needed.
vector_docstore = lmb.LanceDBDocStore(table_name="dummy-1038vkch")
vector_sources = (
    index.chunk_index.values(),
    index.doc_index.values(),
    [entity.summary for entity in index.entity_index.values()],
)
for source in vector_sources:
    vector_docstore.extend(source)

# Probe the store with the same term and display the length of each hit.
results = vector_docstore.retrieve("hypothesis")
list(map(len, results))

In [None]:
# Display the retrieved items themselves. `results` is assigned in both the
# keyword and the vector retrieval cells, so this shows whichever ran last
# (the vector-store results when the notebook is run top to bottom).
results