In [1]:
import json
import os
import pickle

import numpy as np

from typing import Any, Dict, List, Literal, Optional

from llm_foundation import logger
from llm_foundation.agent_types import Persona, Role

from crewai import Agent, Task, Crew
from crewai.crews import CrewOutput
from hackathon.index import generate_entity_embeddings, create_index, search_index, calculate_scores, build_similar_entities
from hackathon.input_output_types import NamedEntities
from hackathon.retrieval_neo4j import chunk_ranker, retrieve_context, retrieve_similar_entities, pagerank
from hackathon.graph_neo4j import add_entities, add_relates_to_relationships, build_vector_index, add_similar_entities, clean_db
from hackathon.tools import filter_named_entities, create_document_deduped_entities_dict, create_matrix_entity_ref_count
from hackathon.utils import build_document_structure, save_document_structure, NEO4JIdentity
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from rich import print
from rich.pretty import pprint


from dotenv import load_dotenv

load_dotenv()


2024-11-06 22:24:28 R2GWRJJGF9 root[64677] INFO Logger root configured


Trying to configure logger root in module llm_foundation
root # of associated handlers - 0
Logging is not configured yet. Configuring it now.
Basic logging config


2024-11-06 22:24:29 R2GWRJJGF9 httpx[64677] INFO HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json "HTTP/1.1 200 OK"
2024-11-06 22:24:30 R2GWRJJGF9 faiss.loader[64677] INFO Loading faiss.
2024-11-06 22:24:30 R2GWRJJGF9 faiss.loader[64677] INFO Successfully loaded faiss.


True

In [2]:
CHUNK_SIZE = 1000
CHUNK_LIMIT = 15  # 10 chunks for testing. -1 for all chunks
CHAR_OVERLAP = 200
document_name = "../2405.14831v1.pdf"
llm = "gpt-4o-mini"

# Create the Structure of the Document

In [3]:
# The chunk limit is set to 10 for testing purposes. Set it to -1 to process all chunks.
document_chunks = build_document_structure(document_name, chunk_size=CHUNK_SIZE, char_overlap=CHAR_OVERLAP, chunk_limit=CHUNK_LIMIT)

2024-11-06 22:24:34 R2GWRJJGF9 root[64677] INFO --------------------------------------------------------------------------------
2024-11-06 22:24:34 R2GWRJJGF9 root[64677] INFO Number of chunks: 101
2024-11-06 22:24:34 R2GWRJJGF9 root[64677] INFO --------------------------------------------------------------------------------


A document consist of a List of Chunks
Each Chunk is initially a dictionary with the following elements

```python
{
    id: int,
    text: str
}
```

In [4]:
document_chunks[1]

{'id': 1,
 'text': 'theory of human long-term memory to enable deeper and more efficient knowledge\nintegration over new experiences. HippoRAG synergistically orchestrates LLMs,\nknowledge graphs, and the Personalized PageRank algorithm to mimic the different\nroles of neocortex and hippocampus in human memory. We compare HippoRAG\nwith existing RAG methods on multi-hop question answering and show that our\nmethod outperforms the state-of-the-art methods remarkably, by up to 20%. Single-\nstep retrieval with HippoRAG achieves comparable or better performance than\niterative retrieval like IRCoT while being 10-30times cheaper and 6-13times faster,\nand integrating HippoRAG into IRCoT brings further substantial gains. Finally,\nwe show that our method can tackle new types of scenarios that are out of reach of\nexisting methods.1\n1 Introduction\nMillions of years of evolution have led mammalian brains to develop the crucial ability to store large'}

# Extract name entities from each chunk

In [5]:
extract_entities_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to extract entities from the given paragraph, in the same language as the paragraph.
Respond with a JSON list of entities."""),
        ("human", """Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```"""),
        ("ai", """{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("human", """Paragraph:```
{passage_text}
```"""),
    ]
)

extract_triplets_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to construct an RDF (Resource Description Framework) graph from the given passages and entity lists. 
Respond with a JSON list of triples, with each triple representing a relationship in the RDF graph. 

Pay attention to the following requirements:
- Each triple should contain at least one, but preferably two, of the named entities in the list for each passage.
- Clearly resolve pronouns to their specific names to maintain clarity.
"""),
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```

{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("ai", """{{"triples": [
            ["Radio City", "located in", "India"],
            ["Radio City", "is", "private FM radio station"],
            ["Radio City", "started on", "3 July 2001"],
            ["Radio City", "plays songs in", "Hindi"],
            ["Radio City", "plays songs in", "English"]
            ["Radio City", "forayed into", "New Media"],
            ["Radio City", "launched", "PlanetRadiocity.com"],
            ["PlanetRadiocity.com", "launched in", "May 2008"],
            ["PlanetRadiocity.com", "is", "music portal"],
            ["PlanetRadiocity.com", "offers", "news"],
            ["PlanetRadiocity.com", "offers", "videos"],
            ["PlanetRadiocity.com", "offers", "songs"]
    ]
}}"""),
        ("human", """Convert the paragraph into a JSON dict, it has a entity list and a triple list.
Paragraph:
```
{passage_text}
```

{entities}"""),
    ]
)

In [6]:
def extend_document_chunks_with_entities_and_triples(llm_model, document_chunks: List[Dict]) -> List[Dict]:

    for chunk in document_chunks:
        chunk["named_entities"] =[]
        chunk["triples"] = []
        try:
            json_output_parser = SimpleJsonOutputParser()
            chain_entities = extract_entities_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
            named_entities = chain_entities.invoke({"passage_text": chunk["text"]})
            chunk["named_entities"] = named_entities["entities"]

            chain_triples = extract_triplets_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
            triples = chain_triples.invoke({"passage_text": chunk["text"], "entities": named_entities})
            chunk["triples"] = triples["triples"]
        except Exception as e:
            print(f"Error processing passage: {e}")
            continue
    
    return document_chunks

In [7]:
document_chunks_with_entities_and_triples = extend_document_chunks_with_entities_and_triples(llm, document_chunks)

2024-11-06 22:24:43 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:46 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:47 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:49 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:52 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:54 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:55 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-06 22:24:57 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/cha

Now each Chunk is has also in the dictionary named entities and triples

```python
{
    id: int,
    text: str,
    named_entities: List[str]
}
```

In [8]:
document_chunks_with_entities_and_triples[1]

{'id': 1,
 'text': 'theory of human long-term memory to enable deeper and more efficient knowledge\nintegration over new experiences. HippoRAG synergistically orchestrates LLMs,\nknowledge graphs, and the Personalized PageRank algorithm to mimic the different\nroles of neocortex and hippocampus in human memory. We compare HippoRAG\nwith existing RAG methods on multi-hop question answering and show that our\nmethod outperforms the state-of-the-art methods remarkably, by up to 20%. Single-\nstep retrieval with HippoRAG achieves comparable or better performance than\niterative retrieval like IRCoT while being 10-30times cheaper and 6-13times faster,\nand integrating HippoRAG into IRCoT brings further substantial gains. Finally,\nwe show that our method can tackle new types of scenarios that are out of reach of\nexisting methods.1\n1 Introduction\nMillions of years of evolution have led mammalian brains to develop the crucial ability to store large',
 'named_entities': ['HippoRAG',
  'LLMs

## Save Document Structure

In [9]:
document_structure_file = f"{document_name.rsplit(".", 1)[0]}_document_structure.pkl"
save_document_structure(document_chunks_with_entities_and_triples, document_structure_file)

2024-11-06 22:25:46 R2GWRJJGF9 root[64677] INFO Saving document structure to ../2405.14831v1_document_structure.pkl


# Filter Named entities

In [10]:
document_chunks_with_entities_and_triples = filter_named_entities(document_chunks_with_entities_and_triples)

2024-11-06 22:25:47 R2GWRJJGF9 root[64677] INFO Initial Named Entities (11): ['hipporag', 'neurobiologically inspired', 'long-term memory', 'large language models', 'bernal jiménez gutiérrez', 'the ohio state university', 'yiheng shu', 'yu gu', 'michihiro yasunaga', 'stanford university', 'yu su']
2024-11-06 22:25:47 R2GWRJJGF9 root[64677] INFO Initial Named Entities after dedup (11): {'the ohio state university', 'bernal jiménez gutiérrez', 'hipporag', 'neurobiologically inspired', 'yiheng shu', 'yu gu', 'long-term memory', 'large language models', 'stanford university', 'yu su', 'michihiro yasunaga'}
2024-11-06 22:25:47 R2GWRJJGF9 root[64677] INFO Final Named Entities (11): {'the ohio state university', 'bernal jiménez gutiérrez', 'hipporag', 'neurobiologically inspired', 'yiheng shu', 'yu gu', 'long-term memory', 'large language models', 'stanford university', 'yu su', 'michihiro yasunaga'}
2024-11-06 22:25:47 R2GWRJJGF9 root[64677] INFO Initial Named Entities (8): ['hipporag', 'llm

## Save Document Structure after Filter Named Entities

In [11]:
document_structure_file_with_ne = f"{document_name.rsplit(".", 1)[0]}_document_structure_with_ne.pkl"
save_document_structure(document_chunks_with_entities_and_triples, document_structure_file_with_ne)

2024-11-06 22:25:48 R2GWRJJGF9 root[64677] INFO Saving document structure to ../2405.14831v1_document_structure_with_ne.pkl


In [12]:
document_chunks_with_entities_and_triples[1]

{'id': 1,
 'text': 'theory of human long-term memory to enable deeper and more efficient knowledge\nintegration over new experiences. HippoRAG synergistically orchestrates LLMs,\nknowledge graphs, and the Personalized PageRank algorithm to mimic the different\nroles of neocortex and hippocampus in human memory. We compare HippoRAG\nwith existing RAG methods on multi-hop question answering and show that our\nmethod outperforms the state-of-the-art methods remarkably, by up to 20%. Single-\nstep retrieval with HippoRAG achieves comparable or better performance than\niterative retrieval like IRCoT while being 10-30times cheaper and 6-13times faster,\nand integrating HippoRAG into IRCoT brings further substantial gains. Finally,\nwe show that our method can tackle new types of scenarios that are out of reach of\nexisting methods.1\n1 Introduction\nMillions of years of evolution have led mammalian brains to develop the crucial ability to store large',
 'named_entities': ['state-of-the-art m

# Dedup Entities

In [13]:
entity2uid_dict = create_document_deduped_entities_dict(document_chunks_with_entities_and_triples)

In [14]:
entity2uid_dict

{'the ohio state university': 0,
 'bernal jiménez gutiérrez': 1,
 'hipporag': 2,
 'neurobiologically inspired': 3,
 'yiheng shu': 4,
 'yu gu': 5,
 'long-term memory': 6,
 'large language models': 7,
 'stanford university': 8,
 'yu su': 9,
 'michihiro yasunaga': 10,
 'state-of-the-art methods': 11,
 'new types of scenarios': 12,
 'faster than ircot': 13,
 'neocortex': 14,
 'rag': 15,
 'multi-hop question answering': 16,
 'ircot': 17,
 'cheaper than ircot': 18,
 'hippocampus': 19,
 'comparable performance to ircot': 20,
 'llms': 21,
 'personalized pagerank': 22,
 'passage boundaries': 23,
 'retrieval-augmented generation': 24,
 'code': 25,
 'data': 26,
 'users to present new knowledge': 27,
 'world knowledge': 28,
 'new knowledge': 29,
 'current rag methods': 30,
 'humans': 31,
 'ai systems': 32,
 'https://github.com/osu-nlp-group/hipporag': 33,
 'knowledge': 34,
 'long-term memory in llms': 35,
 'model editing': 36,
 'alzheimer’s': 37,
 'cs.cl': 38,
 'prof. thomas': 39,
 '23 may 2024': 

## Save the entity to uid dict to a file

In [15]:
with open(f"{document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl", "wb") as f:
    pickle.dump(entity2uid_dict, f)
logger.info(f"entity2uid_dict has been saved to {document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl")

2024-11-06 22:26:08 R2GWRJJGF9 root[64677] INFO entity2uid_dict has been saved to ../2405.14831v1_entity2uid_dict.pkl


# Matrix Creation

This matrix is important for pagerank below. Each row is an entity id and each column represents a chunk. The contents of each cell is the number of references to the entity in that paragraph.

In [16]:
string = """manipulatable, likely higher-level, features, which are then routed through the parahippocampal
regions (phr) to be indexed by the hippocampus. when they reach the hippocampus , salient signals
are included in the hippocampal index and associated with each other."""

string.count("parahippocampal\nregions")

1

In [17]:
matrix = create_matrix_entity_ref_count(document_chunks_with_entities_and_triples, entity2uid_dict)

print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
np.set_printoptions(threshold=np.inf)
pprint(matrix.shape)
pprint(matrix)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

#### Detail the name of the entity that is referenced and its count in each chunk

In [18]:
example_chunk = document_chunks_with_entities_and_triples[0]

n_of_entities = len(entity2uid_dict)
n_of_chunks = len(document_chunks_with_entities_and_triples)

for e_idx in range(n_of_entities):
    entity_name = list(entity2uid_dict.keys())[list(entity2uid_dict.values()).index(e_idx)]
    logger.info(f"Entity: {e_idx} {entity_name} Chunk count: {matrix[e_idx][:]}")


2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 0 the ohio state university Chunk count: [4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 1 bernal jiménez gutiérrez Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 2 hipporag Chunk count: [2. 4. 0. 0. 1. 2. 2. 2. 0. 2. 0. 0. 1. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 3 neurobiologically inspired Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 4 yiheng shu Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 5 yu gu Chunk count: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entity: 6 long-term memory Chunk count: [1. 0. 3. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0.]
2024-11-06 22:26:18 R2GWRJJGF9 root[64677] INFO Entit

## Save the matrix to a file

In [19]:
with open(f"{document_name.rsplit(".", 1)[0]}_entity_per_chunk_count_matrix.pkl", "wb") as f:
    pickle.dump(matrix, f)
logger.info(f"Entity per chunk count matrix has been saved to {document_name.rsplit(".", 1)[0]}_entity_per_chunk_count_matrix.pkl")

2024-11-06 22:26:19 R2GWRJJGF9 root[64677] INFO Entity per chunk count matrix has been saved to ../2405.14831v1_entity_per_chunk_count_matrix.pkl


### Load the Matrix

In [20]:
matrix = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_entity_per_chunk_count_matrix.pkl", "rb").read())
matrix

array([[ 4.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 2.,  4.,  0.,  0.,  1.,  2.,  2.,  2.,  0.,  2.,  0.,  0.,  1.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  3.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 2.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
       

# Index Creation

In [21]:
# Embeddings and FAISS index params
emb_dimension = 256
recall_at_k = 3  # how far in the indices/distances we go

# M_max defines the maximum number of links a vertex can have, and M_max0, which defines the same but for vertices in layer 0.
M = 64  # for HNSW index, the number of neighbors we add to each vertex on insertion. 
# Faiss sets M_max and M_max0 automatically in the set_default_probas method, at index initialization. 
# The M_max value is set to M, and M_max0 set to M*2

## Generating embeddings for named entities in document


In [22]:
named_entities_dict = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl", "rb").read())
logger.info(f"Named entities dict loaded: {named_entities_dict}")
entities = list(named_entities_dict.keys())
logger.info(f"Number of entities: {len(entities)}. First entity is: {entities[0]}")

2024-11-06 22:26:27 R2GWRJJGF9 root[64677] INFO Named entities dict loaded: {'the ohio state university': 0, 'bernal jiménez gutiérrez': 1, 'hipporag': 2, 'neurobiologically inspired': 3, 'yiheng shu': 4, 'yu gu': 5, 'long-term memory': 6, 'large language models': 7, 'stanford university': 8, 'yu su': 9, 'michihiro yasunaga': 10, 'state-of-the-art methods': 11, 'new types of scenarios': 12, 'faster than ircot': 13, 'neocortex': 14, 'rag': 15, 'multi-hop question answering': 16, 'ircot': 17, 'cheaper than ircot': 18, 'hippocampus': 19, 'comparable performance to ircot': 20, 'llms': 21, 'personalized pagerank': 22, 'passage boundaries': 23, 'retrieval-augmented generation': 24, 'code': 25, 'data': 26, 'users to present new knowledge': 27, 'world knowledge': 28, 'new knowledge': 29, 'current rag methods': 30, 'humans': 31, 'ai systems': 32, 'https://github.com/osu-nlp-group/hipporag': 33, 'knowledge': 34, 'long-term memory in llms': 35, 'model editing': 36, 'alzheimer’s': 37, 'cs.cl': 38,

### Generate (and save) entity embeddings and convert them to np


In [23]:
embeddings_filepath = f"{document_name.rsplit('.', 1)[0]}_entity_embeddings.pkl"

In [24]:

entities_embeddings = generate_entity_embeddings(entities, emb_dimension, embeddings_filepath)


2024-11-06 22:26:31 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


### Checkpoint Step: Load the entity Embeddings (Just to continue from here)

In [25]:
entities_embeddings = pickle.loads(open(embeddings_filepath, "rb").read())
entities_embeddings = np.array(entities_embeddings)

## Create Index and Query It with the same elements we indexed it

In [26]:
faiss_index = create_index(entities_embeddings, emb_dimension, M)
distances, indexes = search_index(faiss_index, entities_embeddings, recall_at_k)

2024-11-06 22:26:34 R2GWRJJGF9 root[64677] INFO 
Distances:
[[0.    0.985 1.253]
 [0.    1.298 1.317]
 [0.    0.424 0.847]
 [0.    1.018 1.02 ]
 [0.    0.927 0.932]
 [0.    0.799 0.927]
 [0.    0.345 0.472]
 [0.    1.124 1.128]
 [0.    0.231 0.632]
 [0.    0.799 0.932]
 [0.    1.063 1.274]
 [0.    1.115 1.132]
 [0.    1.12  1.145]
 [0.    0.409 0.533]
 [0.    0.442 0.605]
 [0.    1.055 1.117]
 [0.    0.466 0.478]
 [0.    0.654 0.671]
 [0.    0.409 0.505]
 [0.    0.591 0.753]
 [0.    0.505 0.533]
 [0.    0.414 0.817]
 [0.    0.509 1.08 ]
 [0.    0.539 0.765]
 [0.    0.69  0.878]
 [0.    1.043 1.066]
 [0.    1.032 1.066]
 [0.    0.717 0.986]
 [0.    0.75  0.784]
 [0.    0.43  0.716]
 [0.    1.103 1.158]
 [0.    1.113 1.157]
 [0.    0.972 1.039]
 [0.    0.689 0.866]
 [0.    0.716 0.784]
 [0.    0.076 0.481]
 [0.    1.076 1.107]
 [0.    0.527 0.631]
 [0.    1.031 1.054]
 [0.    0.184 0.551]
 [0.    1.344 1.357]
 [0.    0.231 0.456]
 [0.    1.109 1.129]
 [0.    0.527 0.643]
 [0.    0.526 1.

## Build Similar Entities with Recall at K

In [27]:
similar_entities = build_similar_entities(entities, indexes, distances, recall_at_k, max_distance=0.85)  # Original max_distance=0.7
logger.info(f"Similar entities:\n{similar_entities}")

# TODO Scores discarded for now
# scores = calculate_scores(distances)
# similar_entities_score = build_similar_entities_with_scores(entities, indexes, scores, recall_at_k, min_score=0.5)            
# logger.info(similar_entities_score)

2024-11-06 22:26:36 R2GWRJJGF9 root[64677] INFO Similarity (<=0.85 dist) found for hipporag (2) with hipporag methodology (141): Distance 0.42426562309265137
2024-11-06 22:26:36 R2GWRJJGF9 root[64677] INFO Similarity (<=0.85 dist) found for hipporag (2) with detailed example of the hipporag process (101): Distance 0.8471661806106567
2024-11-06 22:26:36 R2GWRJJGF9 root[64677] INFO Similarity (<=0.85 dist) found for yu gu (5) with yu su (9): Distance 0.7985775470733643
2024-11-06 22:26:36 R2GWRJJGF9 root[64677] INFO Similarity (<=0.85 dist) found for long-term memory (6) with human long-term memory (53): Distance 0.3448185920715332
2024-11-06 22:26:36 R2GWRJJGF9 root[64677] INFO Similarity (<=0.85 dist) found for long-term memory (6) with long-term memory for llms (56): Distance 0.4722738265991211
2024-11-06 22:26:36 R2GWRJJGF9 root[64677] INFO Similarity (<=0.85 dist) found for stanford university (8) with stanford (41): Distance 0.23052369058132172
2024-11-06 22:26:36 R2GWRJJGF9 root[6

## Create Neo4J Graph

In [28]:
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
NEO4J_DATABASE = os.environ["NEO4J_DB"] 

neo4j_conn = NEO4JIdentity(uri=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

Required to create the graph:
    - entity_embeddings
    - named entities dict 
    - doc structure
    - similar entities

### Step 1: Add all entities to the graph

In [29]:
clean_db(kg)

2024-11-06 22:26:51 R2GWRJJGF9 root[64677] INFO Cleaning Neo4j database
2024-11-06 22:26:52 R2GWRJJGF9 root[64677] INFO Result after deleting nodes:
[]
2024-11-06 22:26:52 R2GWRJJGF9 root[64677] INFO Result after properties:
[]


In [30]:
len(named_entities_dict), len(entities_embeddings)

(174, 174)

In [31]:
add_entities(kg, entities_embeddings, named_entities_dict)

### Step 2: Add RELATES_TO relationships

In [32]:
doc_structure = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_document_structure_with_ne.pkl", "rb").read())
doc_structure

[{'id': 0,
  'text': 'HippoRAG: Neurobiologically Inspired\nLong-Term Memory for Large Language Models\nBernal Jiménez Gutiérrez\nThe Ohio State University\njimenezgutierrez.1@osu.eduYiheng Shu\nThe Ohio State University\nshu.251@osu.edu\nYu Gu\nThe Ohio State University\ngu.826@osu.eduMichihiro Yasunaga\nStanford University\nmyasu@cs.stanford.eduYu Su\nThe Ohio State University\nsu.809@osu.edu\nAbstract\nIn order to thrive in hostile and ever-changing natural environments, mammalian\nbrains evolved to store large amounts of knowledge about the world and continually\nintegrate new information while avoiding catastrophic forgetting. Despite the\nimpressive accomplishments, large language models (LLMs), even with retrieval-\naugmented generation (RAG), still struggle to efficiently and effectively integrate\na large amount of new experiences after pre-training. In this work, we introduce\nHippoRAG, a novel retrieval framework inspired by the hippocampal indexing',
  'named_entities': ['t

In [33]:
add_relates_to_relationships(kg, doc_structure)


2024-11-06 22:27:02 R2GWRJJGF9 neo4j.notifications[64677] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 37} for query: '\n    UNWIND $triplets AS triplet\n    MATCH (a:Entity {name: triplet.subject}), (b:Entity {name: triplet.object})\n    MERGE (a)-[:RELATES_TO {type: triplet.predicate}]->(b)\n    '


### Step 3: Add SIMILAR_TO relationships

In [34]:
add_similar_entities(kg, similar_entities)

2024-11-06 22:27:03 R2GWRJJGF9 neo4j.notifications[64677] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 40} for query: '\n    UNWIND $similar_entities AS se\n    MATCH (a:Entity {name: se.entity}), (b:Entity {name: se.similar_entity})\n    MERGE (a)-[:SIMILAR_TO]->(b)\n    '


### Step 4: Build vector index

In [35]:
build_vector_index(kg, emb_dim=emb_dimension)

2024-11-06 22:27:04 R2GWRJJGF9 neo4j.notifications[64677] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE VECTOR INDEX entityIdx IF NOT EXISTS FOR (e:Entity) ON (e.embedding) OPTIONS {indexConfig: {`vector.dimensions`: $emb_dim, `vector.similarity_function`: $sim_func}}` has no effect.} {description: `VECTOR INDEX entityIdx FOR (e:Entity) ON (e.embedding)` already exists.} {position: None} for query: '\n    CREATE VECTOR INDEX $idx_name IF NOT EXISTS\n    FOR (m:Entity)\n    ON m.embedding\n    OPTIONS {indexConfig: {\n        `vector.dimensions`: $emb_dim,\n        `vector.similarity_function`: $sim_func\n    }}\n    '


# Retrieval

In [36]:
entity_master = Persona.from_yaml_file("../Personas/EntityMasterCrewAI.yaml")
# pprint(entity_master)

2024-11-06 22:27:06 R2GWRJJGF9 root[64677] INFO YAML data:
{'name': 'Entity_Master', 'roles': {'entity_extractor': {'name': 'Entity_Extractor', 'description': 'Identify and extract named entities from text.', 'agent_system_message': "You are an expert extracting named entities from given paragraphs of text.\nYou've done very reliably this kind of work thousands of times. Below there's\nand example of how you proceed with a task at hand.\n\n{entity_extractor_examples}\n", 'examples': [{'format': 'text', 'content': 'Example:\n\nParagraph:\n```\nRadio City\nRadio City is India\'s first private FM radio station and was started on 3 July 2001. It plays Hindi, English\nand regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music\nportal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related\nfeatures.\n```\n\nOutput:\n{{"named_entities": ["Radio City", "India", "3 July 2001", "Hindi", "English", "New Media", "May 

### Query Entity Extractor Agent

Extracts the entities from the user query to retrieve the best chunks later

In [37]:
user_query = "What does the hippocampal memory indexing theory propose?"
user_query = "What are the main three brain regions involved in the hippocampal memory indexing theory?"

In [38]:
entity_extractor_role: Role = entity_master.get_role("entity_extractor")
pprint(entity_extractor_role)
entity_extractor: Agent = entity_extractor_role.to_crewai_agent(verbose=True, allow_delegation=False)

extract_entities = Task(
    description=entity_extractor_role.tasks[0].description,
    expected_output=entity_extractor_role.tasks[0].expected_output,
    agent=entity_extractor,
    output_json=NamedEntities,
)

query_inputs = {
    "paragraph": user_query,
    # "paragraph": "What is HippoRAG?",
    "entity_extractor_examples": entity_extractor_role.get_examples_as_str(),
}

crew = Crew(
    agents=[entity_extractor],
    tasks=[extract_entities],
    verbose=True,
)

logger.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
result: CrewOutput = crew.kickoff(inputs=query_inputs)

logger.info(".................................................................................")
logger.info(type(result.json))
entities = json.loads(result.json)
query_entities = entities["named_entities"]
logger.info(f"Query entities: {query_entities}")
logger.info(".................................................................................")


2024-11-06 22:27:09 R2GWRJJGF9 root[64677] INFO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[92m22:27:09 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai


[1m[95m# Agent:[00m [1m[92mEntity_Extractor[00m
[95m## Task:[00m [92mExtract named entities from this paragraph: ```What are the main three brain regions involved in the hippocampal memory indexing theory?```.
[00m


[92m22:27:10 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler
2024-11-06 22:27:10 R2GWRJJGF9 root[64677] INFO .................................................................................
2024-11-06 22:27:10 R2GWRJJGF9 root[64677] INFO <class 'str'>
2024-11-06 22:27:10 R2GWRJJGF9 root[64677] INFO Query entities: ['hippocampal memory indexing theory']
2024-11-06 22:27:10 R2GWRJJGF9 root[64677] INFO .................................................................................




[1m[95m# Agent:[00m [1m[92mEntity_Extractor[00m
[95m## Final Answer:[00m [92m
{"named_entities": ["hippocampal memory indexing theory"]}[00m




In [39]:
related_nodes = retrieve_similar_entities(neo4j_conn, query_entities)

for node in related_nodes:
    logger.info(node)


2024-11-06 22:27:13 R2GWRJJGF9 httpx[64677] INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 61, 'name': 'hippocampal memory indexing theory', 'score': 0.9994369745254517}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 55, 'name': 'hippocampal index', 'score': 0.847712516784668}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 114, 'name': 'artificial hippocampal index', 'score': 0.8320860862731934}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 114, 'name': 'artificial hippocampal index'}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 129, 'name': 'named entities to hippocampal index'}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 55, 'name': 'hippocampal index'}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 114, 'name': 'artificial hippocampal index'}
2024-11-06 22:27:15 R2GWRJJGF9 root[64677] INFO {'id': 55, 'name': 'hippocampal index'}
2024-11-06 22:2

In [40]:
nodes_score = pagerank(neo4j_conn, related_nodes, matrix)

for score in nodes_score:
    logger.info(f"score= {score}")

2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO Node sum61: 4.0 personalization: 0.027777777777777776
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO Node sum55: 5.0 personalization: 0.022222222222222223
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO Node sum114: 2.0 personalization: 0.05555555555555555
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.025366590126708957
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJGF9 root[64677] INFO score= 0.0
2024-11-06 22:27:18 R2GWRJJG

### Rank the Chunks after PageRank

In [41]:
matrix.shape, len(nodes_score)

((174, 15), 174)

In [42]:
chunks_score, chunks_order = chunk_ranker(matrix, nodes_score)

[0.02536659 0.         0.08328697 0.         0.         0.33475344
 0.         0.48473813 0.04175341 0.19662181 0.31668745 0.0244703
 0.22389647 0.         0.07828229]
[ 7  5 10 12  9  2 14  8  0 11 13  6  4  3  1]


### Get Context

In [47]:
# Change the number of chunks to see the difference in the output!!!!!

# Try with 1, 2, 3...

context = retrieve_context(doc_structure, chunks_score, chunks_order, max_chunks=3)
context

'a case study illustrating the limitations of current methods as well as our method’s potential on the\npreviously discussed path-finding multi-hop QA setting.\n2 HippoRAG\nIn this section, we first give a brief overview of the hippocampal memory indexing theory, followed\nby how HippoRAG’s indexing and retrieval design was inspired by this theory, and finally offer a\nmore detailed account of our methodology.\n2.1 The Hippocampal Memory Indexing Theory\nThe hippocampal memory indexing theory [ 58] is a well-established theory that provides a functional\ndescription of the components and circuitry involved in human long-term memory. In this theory,\nTeyler and Discenna [58] propose that human long-term memory is composed of three components\nthat work together to accomplish two main objectives: pattern separation , which ensures that the\nrepresentations of distinct perceptual experiences are unique, and pattern completion , which enables\n\nrelative ease. The hippocampal memory indexi

### HippoRAG Enhanced Query Agent

Answer the question based on the retrieved context using the HippoRAG

In [48]:
hippo_savant_role: Role = entity_master.get_role("hippo_savant")
hippo_savant: Agent = hippo_savant_role.to_crewai_agent(verbose=True, allow_delegation=False)

answer_question = Task(
    description=hippo_savant_role.tasks[0].description,
    expected_output=hippo_savant_role.tasks[0].expected_output,
    agent=hippo_savant,
)

query_inputs = {
    "context": context,
    "query": user_query,
}

crew = Crew(
    agents=[hippo_savant],
    tasks=[answer_question],
    verbose=True,
)

logger.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
result: CrewOutput = crew.kickoff(inputs=query_inputs)
logger.info(".................................................................................")
logger.info(f"Answer:\n{pprint(result.raw)}")
logger.info(".................................................................................")


2024-11-06 22:29:12 R2GWRJJGF9 root[64677] INFO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Calling Agents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[92m22:29:12 - LiteLLM:INFO[0m: utils.py:2749 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai


[1m[95m# Agent:[00m [1m[92mHippo Savant[00m
[95m## Task:[00m [92mGiven a question and a context that provides useful information about it,
answer the question based only on the context.

Context:
```a case study illustrating the limitations of current methods as well as our method’s potential on the
previously discussed path-finding multi-hop QA setting.
2 HippoRAG
In this section, we first give a brief overview of the hippocampal memory indexing theory, followed
by how HippoRAG’s indexing and retrieval design was inspired by this theory, and finally offer a
more detailed account of our methodology.
2.1 The Hippocampal Memory Indexing Theory
The hippocampal memory indexing theory [ 58] is a well-established theory that provides a functional
description of the components and circuitry involved in human long-term memory. In this theory,
Teyler and Discenna [58] propose that human long-term memory is composed of three components
that work together to accomplish two main objective

[92m22:29:13 - LiteLLM:INFO[0m: utils.py:944 - Wrapper: Completed Call, calling success_handler
2024-11-06 22:29:13 R2GWRJJGF9 root[64677] INFO .................................................................................




[1m[95m# Agent:[00m [1m[92mHippo Savant[00m
[95m## Final Answer:[00m [92m
The main three brain regions involved in the hippocampal memory indexing theory are the hippocampus, the neocortex, and the parahippocampal regions.[00m




2024-11-06 22:29:13 R2GWRJJGF9 root[64677] INFO Answer:
None
2024-11-06 22:29:13 R2GWRJJGF9 root[64677] INFO .................................................................................
