In [6]:
import json
import pickle

import numpy as np

from typing import Any, Dict, List, Literal, Optional

from llm_foundation import logger
from llm_foundation.agent_types import Persona, Role

from crewai import Agent, Task, Crew
from crewai.crews import CrewOutput
from hackathon.index import generate_entity_embeddings, create_index, search_index, build_similar_entities
from hackathon.input_output_types import NamedEntities
from hackathon.retrieval_neo4j import chunk_ranker, retrieve_context, retrieve_similar_entities, pagerank
from hackathon.graph_neo4j import add_entities, add_relates_to_relationships, build_vector_index, add_similar_entities, clean_db, Neo4jClientFactory
from hackathon.tools import filter_named_entities, create_document_deduped_entities_dict, create_matrix_entity_ref_count
from hackathon.utils import build_document_structure, save_document_structure
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from rich import print
from rich.pretty import pprint


from dotenv import load_dotenv

load_dotenv()


True

In [7]:
CHUNK_SIZE = 1000
CHUNK_LIMIT = 15  # 10 chunks for testing. -1 for all chunks
CHAR_OVERLAP = 200
document_name = "../2405.14831v1.pdf"
llm = "gpt-4o-mini"

# Create the Structure of the Document

In [8]:
# The chunk limit is set to 10 for testing purposes. Set it to -1 to process all chunks.
document_chunks = build_document_structure(document_name, chunk_size=CHUNK_SIZE, char_overlap=CHAR_OVERLAP, chunk_limit=CHUNK_LIMIT)

2024-11-07 21:43:50 R2GWRJJGF9 root[60486] INFO --------------------------------------------------------------------------------
2024-11-07 21:43:50 R2GWRJJGF9 root[60486] INFO Number of chunks: 101
2024-11-07 21:43:50 R2GWRJJGF9 root[60486] INFO --------------------------------------------------------------------------------


A document consist of a List of Chunks
Each Chunk is initially a dictionary with the following elements

```python
{
    id: int,
    text: str
}
```

In [9]:
document_chunks[1]

{'id': 1,
 'text': 'theory of human long-term memory to enable deeper and more efficient knowledge\nintegration over new experiences. HippoRAG synergistically orchestrates LLMs,\nknowledge graphs, and the Personalized PageRank algorithm to mimic the different\nroles of neocortex and hippocampus in human memory. We compare HippoRAG\nwith existing RAG methods on multi-hop question answering and show that our\nmethod outperforms the state-of-the-art methods remarkably, by up to 20%. Single-\nstep retrieval with HippoRAG achieves comparable or better performance than\niterative retrieval like IRCoT while being 10-30times cheaper and 6-13times faster,\nand integrating HippoRAG into IRCoT brings further substantial gains. Finally,\nwe show that our method can tackle new types of scenarios that are out of reach of\nexisting methods.1\n1 Introduction\nMillions of years of evolution have led mammalian brains to develop the crucial ability to store large'}

# Extract name entities from each chunk

In [10]:
extract_entities_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to extract entities from the given paragraph, in the same language as the paragraph.
Respond with a JSON list of entities."""),
        ("human", """Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```"""),
        ("ai", """{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("human", """Paragraph:```
{passage_text}
```"""),
    ]
)

extract_triplets_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to construct an RDF (Resource Description Framework) graph from the given passages and entity lists. 
Respond with a JSON list of triples, with each triple representing a relationship in the RDF graph. 

Pay attention to the following requirements:
- Each triple should contain at least one, but preferably two, of the named entities in the list for each passage.
- Clearly resolve pronouns to their specific names to maintain clarity.
"""),
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```

{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("ai", """{{"triples": [
            ["Radio City", "located in", "India"],
            ["Radio City", "is", "private FM radio station"],
            ["Radio City", "started on", "3 July 2001"],
            ["Radio City", "plays songs in", "Hindi"],
            ["Radio City", "plays songs in", "English"]
            ["Radio City", "forayed into", "New Media"],
            ["Radio City", "launched", "PlanetRadiocity.com"],
            ["PlanetRadiocity.com", "launched in", "May 2008"],
            ["PlanetRadiocity.com", "is", "music portal"],
            ["PlanetRadiocity.com", "offers", "news"],
            ["PlanetRadiocity.com", "offers", "videos"],
            ["PlanetRadiocity.com", "offers", "songs"]
    ]
}}"""),
        ("human", """Convert the paragraph into a JSON dict, it has a entity list and a triple list.
Paragraph:
```
{passage_text}
```

{entities}"""),
    ]
)

In [11]:
def extend_document_chunks_with_entities_and_triples(llm_model, document_chunks: List[Dict]) -> List[Dict]:

    for chunk in document_chunks:
        chunk["named_entities"] =[]
        chunk["triples"] = []
        try:
            json_output_parser = SimpleJsonOutputParser()
            chain_entities = extract_entities_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
            named_entities = chain_entities.invoke({"passage_text": chunk["text"]})
            chunk["named_entities"] = named_entities["entities"]

            chain_triples = extract_triplets_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
            triples = chain_triples.invoke({"passage_text": chunk["text"], "entities": named_entities})
            chunk["triples"] = triples["triples"]
        except Exception as e:
            print(f"Error processing passage: {e}")
            continue
    
    return document_chunks

In [12]:
document_chunks_with_entities_and_triples = extend_document_chunks_with_entities_and_triples(llm, document_chunks)

2024-11-07 21:43:52 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:43:55 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:43:57 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:43:59 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:44:01 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:44:04 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:44:05 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-07 21:44:07 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/cha

Now each Chunk is has also in the dictionary named entities and triples

```python
{
    id: int,
    text: str,
    named_entities: List[str]
}
```

In [13]:
document_chunks_with_entities_and_triples[1]

{'id': 1,
 'text': 'theory of human long-term memory to enable deeper and more efficient knowledge\nintegration over new experiences. HippoRAG synergistically orchestrates LLMs,\nknowledge graphs, and the Personalized PageRank algorithm to mimic the different\nroles of neocortex and hippocampus in human memory. We compare HippoRAG\nwith existing RAG methods on multi-hop question answering and show that our\nmethod outperforms the state-of-the-art methods remarkably, by up to 20%. Single-\nstep retrieval with HippoRAG achieves comparable or better performance than\niterative retrieval like IRCoT while being 10-30times cheaper and 6-13times faster,\nand integrating HippoRAG into IRCoT brings further substantial gains. Finally,\nwe show that our method can tackle new types of scenarios that are out of reach of\nexisting methods.1\n1 Introduction\nMillions of years of evolution have led mammalian brains to develop the crucial ability to store large',
 'named_entities': ['HippoRAG',
  'LLMs

## Save Document Structure

In [14]:
document_structure_file = f"{document_name.rsplit(".", 1)[0]}_document_structure.pkl"
save_document_structure(document_chunks_with_entities_and_triples, document_structure_file)

2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Saving document structure to ../2405.14831v1_document_structure.pkl


# Filter Named entities

In [15]:
document_chunks_with_entities_and_triples = filter_named_entities(document_chunks_with_entities_and_triples)

2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Initial Named Entities (11): ['hipporag', 'neurobiologically inspired', 'long-term memory', 'large language models', 'bernal jiménez gutiérrez', 'the ohio state university', 'yiheng shu', 'yu gu', 'michihiro yasunaga', 'stanford university', 'yu su']
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Initial Named Entities after dedup (11): {'yu gu', 'hipporag', 'yiheng shu', 'stanford university', 'the ohio state university', 'michihiro yasunaga', 'bernal jiménez gutiérrez', 'neurobiologically inspired', 'long-term memory', 'large language models', 'yu su'}
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Final Named Entities (11): {'yu gu', 'hipporag', 'yiheng shu', 'stanford university', 'the ohio state university', 'michihiro yasunaga', 'bernal jiménez gutiérrez', 'neurobiologically inspired', 'long-term memory', 'large language models', 'yu su'}
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Initial Named Entities (8): ['hipporag', 'llm

## Save Document Structure after Filter Named Entities

In [16]:
document_structure_file_with_ne = f"{document_name.rsplit(".", 1)[0]}_document_structure_with_ne.pkl"
save_document_structure(document_chunks_with_entities_and_triples, document_structure_file_with_ne)

2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Saving document structure to ../2405.14831v1_document_structure_with_ne.pkl


In [17]:
document_chunks_with_entities_and_triples[1]

{'id': 1,
 'text': 'theory of human long-term memory to enable deeper and more efficient knowledge\nintegration over new experiences. HippoRAG synergistically orchestrates LLMs,\nknowledge graphs, and the Personalized PageRank algorithm to mimic the different\nroles of neocortex and hippocampus in human memory. We compare HippoRAG\nwith existing RAG methods on multi-hop question answering and show that our\nmethod outperforms the state-of-the-art methods remarkably, by up to 20%. Single-\nstep retrieval with HippoRAG achieves comparable or better performance than\niterative retrieval like IRCoT while being 10-30times cheaper and 6-13times faster,\nand integrating HippoRAG into IRCoT brings further substantial gains. Finally,\nwe show that our method can tackle new types of scenarios that are out of reach of\nexisting methods.1\n1 Introduction\nMillions of years of evolution have led mammalian brains to develop the crucial ability to store large',
 'named_entities': ['hippocampus',
  's

# Dedup Entities

In [18]:
entity2uid_dict = create_document_deduped_entities_dict(document_chunks_with_entities_and_triples)

In [19]:
entity2uid_dict

{'yu gu': 0,
 'hipporag': 1,
 'yiheng shu': 2,
 'stanford university': 3,
 'the ohio state university': 4,
 'michihiro yasunaga': 5,
 'bernal jiménez gutiérrez': 6,
 'neurobiologically inspired': 7,
 'long-term memory': 8,
 'large language models': 9,
 'yu su': 10,
 'hippocampus': 11,
 'state-of-the-art methods': 12,
 'more efficient than iterative retrieval': 13,
 'ircot': 14,
 'comparable performance to ircot': 15,
 'new types of scenarios': 16,
 'knowledge graphs': 17,
 'personalized pagerank': 18,
 'multi-hop question answering': 19,
 'rag': 20,
 'llms': 21,
 'neocortex': 22,
 'rag methods': 23,
 'retrieval-augmented generation': 24,
 'passage boundaries': 25,
 'current rag methods': 26,
 'code': 27,
 'data': 28,
 'https://github.com/osu-nlp-group/hipporag': 29,
 'humans': 30,
 'llms perform tasks': 31,
 'world knowledge': 32,
 'ai systems': 33,
 'knowledge': 34,
 'continuously updating long-term memory': 35,
 'new knowledge': 36,
 'vast stores of knowledge': 37,
 'for ai systems':

## Save the entity to uid dict to a file

In [20]:
with open(f"{document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl", "wb") as f:
    pickle.dump(entity2uid_dict, f)
logger.info(f"entity2uid_dict has been saved to {document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl")

2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO entity2uid_dict has been saved to ../2405.14831v1_entity2uid_dict.pkl


# Matrix Creation

This matrix is important for pagerank below. Each row is an entity id and each column represents a chunk. The contents of each cell is the number of references to the entity in that paragraph.

In [21]:
string = """manipulatable, likely higher-level, features, which are then routed through the parahippocampal
regions (phr) to be indexed by the hippocampus. when they reach the hippocampus , salient signals
are included in the hippocampal index and associated with each other."""

string.count("parahippocampal\nregions")

1

In [22]:
matrix = create_matrix_entity_ref_count(document_chunks_with_entities_and_triples, entity2uid_dict)

print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
np.set_printoptions(threshold=np.inf)
pprint(matrix.shape)
pprint(matrix)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

#### Detail the name of the entity that is referenced and its count in each chunk

In [23]:
example_chunk = document_chunks_with_entities_and_triples[0]

n_of_entities = len(entity2uid_dict)
n_of_chunks = len(document_chunks_with_entities_and_triples)

for e_idx in range(n_of_entities):
    entity_name = list(entity2uid_dict.keys())[list(entity2uid_dict.values()).index(e_idx)]
    logger.info(f"Entity: {e_idx} {entity_name} Chunk count: {matrix[e_idx][:]}")


2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 0 yu gu Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 1 hipporag Chunk count: [2. 4. 0. 0. 1. 2. 2. 2. 0. 2. 0. 0. 1. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 2 yiheng shu Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 3 stanford university Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 4 the ohio state university Chunk count: [4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 5 michihiro yasunaga Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 6 bernal jiménez gutiérrez Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity: 7 

## Save the matrix to a file

In [24]:
with open(f"{document_name.rsplit(".", 1)[0]}_entity_per_chunk_count_matrix.pkl", "wb") as f:
    pickle.dump(matrix, f)
logger.info(f"Entity per chunk count matrix has been saved to {document_name.rsplit(".", 1)[0]}_entity_per_chunk_count_matrix.pkl")

2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Entity per chunk count matrix has been saved to ../2405.14831v1_entity_per_chunk_count_matrix.pkl


### Load the Matrix

In [25]:
matrix = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_entity_per_chunk_count_matrix.pkl", "rb").read())
matrix

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 2.,  4.,  0.,  0.,  1.,  2.,  2.,  2.,  0.,  2.,  0.,  0.,  1.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  3.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 2.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
       

# Index Creation

In [26]:
# Embeddings and FAISS index params
emb_dimension = 256
recall_at_k = 3  # how far in the indices/distances we go

# M_max defines the maximum number of links a vertex can have, and M_max0, which defines the same but for vertices in layer 0.
M = 64  # for HNSW index, the number of neighbors we add to each vertex on insertion. 
# Faiss sets M_max and M_max0 automatically in the set_default_probas method, at index initialization. 
# The M_max value is set to M, and M_max0 set to M*2

## Generating embeddings for named entities in document


In [27]:
named_entities_dict = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl", "rb").read())
logger.info(f"Named entities dict loaded: {named_entities_dict}")
entities = list(named_entities_dict.keys())
logger.info(f"Number of entities: {len(entities)}. First entity is: {entities[0]}")

2024-11-07 21:44:47 R2GWRJJGF9 root[60486] INFO Named entities dict loaded: {'yu gu': 0, 'hipporag': 1, 'yiheng shu': 2, 'stanford university': 3, 'the ohio state university': 4, 'michihiro yasunaga': 5, 'bernal jiménez gutiérrez': 6, 'neurobiologically inspired': 7, 'long-term memory': 8, 'large language models': 9, 'yu su': 10, 'hippocampus': 11, 'state-of-the-art methods': 12, 'more efficient than iterative retrieval': 13, 'ircot': 14, 'comparable performance to ircot': 15, 'new types of scenarios': 16, 'knowledge graphs': 17, 'personalized pagerank': 18, 'multi-hop question answering': 19, 'rag': 20, 'llms': 21, 'neocortex': 22, 'rag methods': 23, 'retrieval-augmented generation': 24, 'passage boundaries': 25, 'current rag methods': 26, 'code': 27, 'data': 28, 'https://github.com/osu-nlp-group/hipporag': 29, 'humans': 30, 'llms perform tasks': 31, 'world knowledge': 32, 'ai systems': 33, 'knowledge': 34, 'continuously updating long-term memory': 35, 'new knowledge': 36, 'vast store

### Generate (and save) entity embeddings and convert them to np


In [28]:
embeddings_filepath = f"{document_name.rsplit('.', 1)[0]}_entity_embeddings.pkl"

In [29]:

entities_embeddings = generate_entity_embeddings(entities, emb_dimension, embeddings_filepath)


2024-11-07 21:44:48 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


### Checkpoint Step: Load the entity Embeddings (Just to continue from here)

In [30]:
entities_embeddings = pickle.loads(open(embeddings_filepath, "rb").read())
entities_embeddings = np.array(entities_embeddings)

## Create Index and Query It with the same elements we indexed it

In [31]:
faiss_index = create_index(entities_embeddings, emb_dimension, M)
distances, indexes = search_index(faiss_index, entities_embeddings, recall_at_k)

2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO 
Distances:
[[0.    0.799 0.927]
 [0.    0.255 0.424]
 [0.    0.927 0.932]
 [0.    0.231 0.632]
 [0.    0.985 1.253]
 [0.    1.063 1.274]
 [0.    1.298 1.317]
 [0.    1.018 1.02 ]
 [0.    0.345 0.472]
 [0.    1.12  1.128]
 [0.    0.799 0.932]
 [0.    0.591 0.753]
 [0.    1.085 1.115]
 [0.    0.417 0.629]
 [0.    0.671 1.101]
 [0.    0.671 1.093]
 [0.    1.12  1.147]
 [0.    0.136 0.468]
 [0.    0.509 1.08 ]
 [0.    0.466 0.478]
 [0.    0.991 1.055]
 [0.    0.414 0.449]
 [0.    0.442 0.605]
 [0.    0.678 0.923]
 [0.    0.69  0.878]
 [0.    0.765 0.936]
 [0.    0.363 0.678]
 [0.    1.043 1.066]
 [0.    1.032 1.066]
 [0.    0.689 0.824]
 [0.    1.113 1.157]
 [0.    0.449 0.797]
 [0.    0.75  0.784]
 [0.    0.512 0.972]
 [0.    0.716 0.739]
 [0.    0.516 0.66 ]
 [0.    0.43  0.716]
 [0.    0.856 0.914]
 [0.    0.512 1.097]
 [0.    0.717 0.986]
 [0.    0.274 0.72 ]
 [0.    1.12  1.135]
 [0.    1.344 1.357]
 [0.    0.363 0.923]
 [0.    1.109 1.

## Build Similar Entities with Recall at K

In [32]:
similar_entities = build_similar_entities(entities, indexes, distances, recall_at_k, max_distance=0.85)  # Original max_distance=0.7
logger.info(f"Similar entities:\n{similar_entities}")

# TODO Scores discarded for now
# scores = calculate_scores(distances)
# similar_entities_score = build_similar_entities_with_scores(entities, indexes, scores, recall_at_k, min_score=0.5)            
# logger.info(similar_entities_score)

2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist) found for yu gu (0) with yu su (10): Distance 0.7985775470733643
2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist) found for hipporag (1) with hipporag process (94): Distance 0.2549542486667633
2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist) found for hipporag (1) with hipporag methodology (141): Distance 0.42426562309265137
2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist) found for stanford university (3) with stanford (48): Distance 0.23052369058132172
2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist) found for stanford university (3) with stanfordner (122): Distance 0.6321331262588501
2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist) found for long-term memory (8) with human long-term memory (67): Distance 0.3448185920715332
2024-11-07 21:44:48 R2GWRJJGF9 root[60486] INFO Similarity (<=0.85 dist

## Create Neo4J Graph

In [33]:
neo4j_factory = Neo4jClientFactory()

Required to create the graph:
- entity_embeddings
- named entities dict 
- doc structure
- similar entities

### Step 1: Add all entities to the graph

In [34]:
clean_db(neo4j_factory)

2024-11-07 21:45:12 R2GWRJJGF9 root[60486] INFO Cleaning Neo4j database
2024-11-07 21:45:13 R2GWRJJGF9 root[60486] INFO Result after deleting nodes:
[]
2024-11-07 21:45:14 R2GWRJJGF9 root[60486] INFO Result after properties:
[]


In [35]:
len(named_entities_dict), len(entities_embeddings)

(173, 173)

In [36]:
add_entities(neo4j_factory, entities_embeddings, named_entities_dict)

### Step 2: Add RELATES_TO relationships

In [37]:
doc_structure = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_document_structure_with_ne.pkl", "rb").read())
doc_structure

[{'id': 0,
  'text': 'HippoRAG: Neurobiologically Inspired\nLong-Term Memory for Large Language Models\nBernal Jiménez Gutiérrez\nThe Ohio State University\njimenezgutierrez.1@osu.eduYiheng Shu\nThe Ohio State University\nshu.251@osu.edu\nYu Gu\nThe Ohio State University\ngu.826@osu.eduMichihiro Yasunaga\nStanford University\nmyasu@cs.stanford.eduYu Su\nThe Ohio State University\nsu.809@osu.edu\nAbstract\nIn order to thrive in hostile and ever-changing natural environments, mammalian\nbrains evolved to store large amounts of knowledge about the world and continually\nintegrate new information while avoiding catastrophic forgetting. Despite the\nimpressive accomplishments, large language models (LLMs), even with retrieval-\naugmented generation (RAG), still struggle to efficiently and effectively integrate\na large amount of new experiences after pre-training. In this work, we introduce\nHippoRAG, a novel retrieval framework inspired by the hippocampal indexing',
  'named_entities': ['y

In [38]:
add_relates_to_relationships(neo4j_factory, doc_structure)


2024-11-07 21:45:47 R2GWRJJGF9 neo4j.notifications[60486] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 37} for query: '\n    UNWIND $triplets AS triplet\n    MATCH (a:Entity {name: triplet.subject}), (b:Entity {name: triplet.object})\n    MERGE (a)-[:RELATES_TO {type: triplet.predicate}]->(b)\n    '


### Step 3: Add SIMILAR_TO relationships

In [40]:
add_similar_entities(neo4j_factory, similar_entities)

2024-11-07 21:46:33 R2GWRJJGF9 neo4j.notifications[60486] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 40} for query: '\n    UNWIND $similar_entities AS se\n    MATCH (a:Entity {name: se.entity}), (b:Entity {name: se.similar_entity})\n    MERGE (a)-[:SIMILAR_TO]->(b)\n    '


### Step 4: Build vector index

In [41]:
build_vector_index(neo4j_factory, emb_dim=emb_dimension)

2024-11-07 21:46:38 R2GWRJJGF9 neo4j.notifications[60486] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE VECTOR INDEX entityIdx IF NOT EXISTS FOR (e:Entity) ON (e.embedding) OPTIONS {indexConfig: {`vector.dimensions`: $emb_dim, `vector.similarity_function`: $sim_func}}` has no effect.} {description: `VECTOR INDEX entityIdx FOR (e:Entity) ON (e.embedding)` already exists.} {position: None} for query: '\n    CREATE VECTOR INDEX $idx_name IF NOT EXISTS\n    FOR (m:Entity)\n    ON m.embedding\n    OPTIONS {indexConfig: {\n        `vector.dimensions`: $emb_dim,\n        `vector.similarity_function`: $sim_func\n    }}\n    '


# Retrieval

In [42]:
entity_master = Persona.from_yaml_file("../Personas/EntityMasterCrewAI.yaml")
# pprint(entity_master)

2024-11-07 21:46:44 R2GWRJJGF9 root[60486] INFO YAML data:
{'name': 'Entity_Master', 'roles': {'entity_extractor': {'name': 'Entity_Extractor', 'description': 'Identify and extract named entities from text.', 'agent_system_message': "You are an expert extracting named entities from given paragraphs of text.\nYou've done very reliably this kind of work thousands of times. Below there's\nand example of how you proceed with a task at hand.\n\n{entity_extractor_examples}\n", 'examples': [{'format': 'text', 'content': 'Example:\n\nParagraph:\n```\nRadio City\nRadio City is India\'s first private FM radio station and was started on 3 July 2001. It plays Hindi, English\nand regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music\nportal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related\nfeatures.\n```\n\nOutput:\n{{"named_entities": ["Radio City", "India", "3 July 2001", "Hindi", "English", "New Media", "May 

### Query Entity Extractor Agent

Extracts the entities from the user query to retrieve the best chunks later

In [43]:
user_query = "What does the hippocampal memory indexing theory propose?"
user_query = "What are the main three brain regions involved in the hippocampal memory indexing theory?"

In [44]:
entity_extractor_role: Role = entity_master.get_role("entity_extractor")
pprint(entity_extractor_role)
entity_extractor: Agent = entity_extractor_role.to_crewai_agent(verbose=True, allow_delegation=False)

extract_entities = Task(
    description=entity_extractor_role.tasks[0].description,
    expected_output=entity_extractor_role.tasks[0].expected_output,
    agent=entity_extractor,
    output_json=NamedEntities,
)

query_inputs = {
    "paragraph": user_query,
    # "paragraph": "What is HippoRAG?",
    "entity_extractor_examples": entity_extractor_role.get_examples_as_str(),
}

crew = Crew(
    agents=[entity_extractor],
    tasks=[extract_entities],
    verbose=True,
)

logger.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
result: CrewOutput = crew.kickoff(inputs=query_inputs)

logger.info(".................................................................................")
logger.info(type(result.json))
entities = json.loads(result.json)
query_entities = entities["named_entities"]
logger.info(f"Query entities: {query_entities}")
logger.info(".................................................................................")


2024-11-07 21:46:47 R2GWRJJGF9 root[60486] INFO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[92m21:46:47 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai


[1m[95m# Agent:[00m [1m[92mEntity_Extractor[00m
[95m## Task:[00m [92mExtract named entities from this paragraph: ```What are the main three brain regions involved in the hippocampal memory indexing theory?```.
[00m


[92m21:46:48 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler
2024-11-07 21:46:48 R2GWRJJGF9 root[60486] INFO .................................................................................
2024-11-07 21:46:48 R2GWRJJGF9 root[60486] INFO <class 'str'>
2024-11-07 21:46:48 R2GWRJJGF9 root[60486] INFO Query entities: ['hippocampal memory indexing theory']
2024-11-07 21:46:48 R2GWRJJGF9 root[60486] INFO .................................................................................




[1m[95m# Agent:[00m [1m[92mEntity_Extractor[00m
[95m## Final Answer:[00m [92m
{"named_entities": ["hippocampal memory indexing theory"]}[00m




In [45]:
related_nodes = retrieve_similar_entities(neo4j_factory, query_entities)

for node in related_nodes:
    logger.info(node)


2024-11-07 21:46:52 R2GWRJJGF9 httpx[60486] INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 59, 'name': 'hippocampal memory indexing theory', 'score': 0.999335765838623}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 61, 'name': 'hippocampal index', 'score': 0.8472342491149902}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 115, 'name': 'artificial hippocampal index', 'score': 0.8316090106964111}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 61, 'name': 'hippocampal index'}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 115, 'name': 'artificial hippocampal index'}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 115, 'name': 'artificial hippocampal index'}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 138, 'name': 'named entities to hippocampal index'}
2024-11-07 21:46:54 R2GWRJJGF9 root[60486] INFO {'id': 61, 'name': 'hippocampal index'}
2024-11-07 21:4

In [46]:
nodes_score = pagerank(neo4j_factory, related_nodes, matrix)

for score in nodes_score:
    logger.info(f"score= {score}")

2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO Node sum59: 4.0 personalization: 0.027777777777777776
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO Node sum61: 5.0 personalization: 0.022222222222222223
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO Node sum115: 2.0 personalization: 0.05555555555555555
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.01753782291569635
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF9 root[60486] INFO score= 0.0
2024-11-07 21:46:57 R2GWRJJGF

### Rank the Chunks after PageRank

In [47]:
matrix.shape, len(nodes_score)

((173, 15), 173)

In [48]:
chunks_score, chunks_order = chunk_ranker(matrix, nodes_score)

[0.01753782 0.         0.00496905 0.         0.10792278 0.21793722
 0.05981245 0.13148897 0.22281563 0.13359349 0.1485994  0.04316614
 0.28399311 0.         0.04548127]
[12  8  5 10  9  7  4  6 14 11  0  2 13  3  1]


### Get Context

In [49]:
# Change the number of chunks to see the difference in the output!!!!!

# Try with 1, 2, 3...

context = retrieve_context(doc_structure, chunks_score, chunks_order, max_chunks=3)
context

'Figure 2: Detailed HippoRAG Methodology. We model the three components of human long-term\nmemory to mimic its pattern separation and completion functions. For offline indexing (Middle) ,\nwe use an LLM to process passages into open KG triples, which are then added to our artificial\nhippocampal index, while our synthetic parahippocampal regions (PHR) detect synonymy. In the\nexample above, triples involving Professor Thomas are extracted and integrated into the KG. For\nonline retrieval (Bottom) , our LLM neocortex extracts named entities from a query while our\nparahippocampal retrieval encoders link them to our hippocampal index. We then leverage the\nPersonalized PageRank algorithm to enable context-based retrieval and extract Professor Thomas.4\n(PPR) algorithm [ 23], a version of PageRank that distributes probability across a graph only through\na set of user-defined source nodes. This constraint allows us to bias the PPR output only towards the\n\nthe retrieval of complete memo

### HippoRAG Enhanced Query Agent

Answer the question based on the retrieved context using the HippoRAG

In [50]:
hippo_savant_role: Role = entity_master.get_role("hippo_savant")
hippo_savant: Agent = hippo_savant_role.to_crewai_agent(verbose=True, allow_delegation=False)

answer_question = Task(
    description=hippo_savant_role.tasks[0].description,
    expected_output=hippo_savant_role.tasks[0].expected_output,
    agent=hippo_savant,
)

query_inputs = {
    "context": context,
    "query": user_query,
}

crew = Crew(
    agents=[hippo_savant],
    tasks=[answer_question],
    verbose=True,
)

logger.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
result: CrewOutput = crew.kickoff(inputs=query_inputs)
logger.info(".................................................................................")
logger.info(f"Answer:\n{pprint(result.raw)}")
logger.info(".................................................................................")


2024-11-07 21:47:08 R2GWRJJGF9 root[60486] INFO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[92m21:47:08 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai


[1m[95m# Agent:[00m [1m[92mHippo Savant[00m
[95m## Task:[00m [92mGiven a question and a context that provides useful information about it,
answer the question based only on the context.

Context:
```Figure 2: Detailed HippoRAG Methodology. We model the three components of human long-term
memory to mimic its pattern separation and completion functions. For offline indexing (Middle) ,
we use an LLM to process passages into open KG triples, which are then added to our artificial
hippocampal index, while our synthetic parahippocampal regions (PHR) detect synonymy. In the
example above, triples involving Professor Thomas are extracted and integrated into the KG. For
online retrieval (Bottom) , our LLM neocortex extracts named entities from a query while our
parahippocampal retrieval encoders link them to our hippocampal index. We then leverage the
Personalized PageRank algorithm to enable context-based retrieval and extract Professor Thomas.4
(PPR) algorithm [ 23], a version of Pag

[92m21:47:09 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler
2024-11-07 21:47:09 R2GWRJJGF9 root[60486] INFO .................................................................................




[1m[95m# Agent:[00m [1m[92mHippo Savant[00m
[95m## Final Answer:[00m [92m
The main three brain regions involved in the hippocampal memory indexing theory are the neocortex, the parahippocampal regions (PHR), and the hippocampus.[00m




2024-11-07 21:47:09 R2GWRJJGF9 root[60486] INFO Answer:
None
2024-11-07 21:47:09 R2GWRJJGF9 root[60486] INFO .................................................................................
