In [1]:
import os
import pickle

import numpy as np

from typing import Dict, List, Literal, Optional

from llm_foundation import logger
from rich import print
from rich.pretty import pprint

from hackathon.index import generate_entity_embeddings, create_index, search_index, calculate_scores, build_similar_entities
from hackathon.graph_neo4j import add_entities, add_relates_to_relationships, build_vector_index, add_similar_entities, clean_db
from hackathon.tools import filter_named_entities, create_document_deduped_entities_dict, create_matrix_entity_ref_count
from hackathon.utils import build_document_structure, save_document_structure

from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser

from dotenv import load_dotenv

load_dotenv()


2024-11-05 13:02:41 R2GWRJJGF9 root[36374] INFO Logger root configured
2024-11-05 13:02:41 R2GWRJJGF9 faiss.loader[36374] INFO Loading faiss.


Trying to configure logger root in module llm_foundation
root # of associated handlers - 0
Logging is not configured yet. Configuring it now.
Basic logging config


2024-11-05 13:02:41 R2GWRJJGF9 faiss.loader[36374] INFO Successfully loaded faiss.


True

In [7]:
# Notebook-wide configuration.
CHUNK_SIZE = 5000  # passed as chunk_size to build_document_structure; presumably characters per chunk — TODO confirm
document_name = "../2405.14831v1.pdf"  # source PDF, relative to the notebook; its path minus extension prefixes every saved artifact
llm = "gpt-4o-mini"  # model name handed to ChatOpenAI(model=...) by the extraction step

# Create the Structure of the Document

In [4]:
# Split the PDF into chunks; each chunk is a dict with `id` and `text` keys (see the sample displayed below).
document_chunks = build_document_structure(document_name, chunk_size=CHUNK_SIZE)

2024-11-05 11:29:10 R2GWRJJGF9 root[40146] INFO --------------------------------------------------------------------------------
2024-11-05 11:29:10 R2GWRJJGF9 root[40146] INFO Number of chunks: 19
2024-11-05 11:29:10 R2GWRJJGF9 root[40146] INFO --------------------------------------------------------------------------------


A document consists of a list of chunks.
Each chunk is initially a dictionary with the following elements:

```python
{
    id: int,
    text: str
}
```

In [5]:
# Display the first chunk to sanity-check the structure.
document_chunks[0]

{'id': 0,
 'text': 'HippoRAG: Neurobiologically Inspired\nLong-Term Memory for Large Language Models\nBernal Jiménez Gutiérrez\nThe Ohio State University\njimenezgutierrez.1@osu.eduYiheng Shu\nThe Ohio State University\nshu.251@osu.edu\nYu Gu\nThe Ohio State University\ngu.826@osu.eduMichihiro Yasunaga\nStanford University\nmyasu@cs.stanford.eduYu Su\nThe Ohio State University\nsu.809@osu.edu\nAbstract\nIn order to thrive in hostile and ever-changing natural environments, mammalian\nbrains evolved to store large amounts of knowledge about the world and continually\nintegrate new information while avoiding catastrophic forgetting. Despite the\nimpressive accomplishments, large language models (LLMs), even with retrieval-\naugmented generation (RAG), still struggle to efficiently and effectively integrate\na large amount of new experiences after pre-training. In this work, we introduce\nHippoRAG, a novel retrieval framework inspired by the hippocampal indexing\ntheory of human long-term 

# Extract named entities from each chunk

In [6]:
# Few-shot prompt for entity extraction: a system instruction, one worked
# example (human paragraph -> AI JSON answer), then the real paragraph.
# Doubled braces ({{ }}) render as literal braces; {passage_text} is the only
# template variable. The model is expected to answer with {"entities": [...]}.
extract_entities_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to extract entities from the given paragraph, in the same language as the paragraph.
Respond with a JSON list of entities."""),
        ("human", """Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```"""),
        ("ai", """{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("human", """Paragraph:```
{passage_text}
```"""),
    ]
)

# Few-shot prompt for triple extraction: a system instruction, one worked
# example (paragraph + entity list -> AI JSON answer), then the real request.
# Template variables: {passage_text} and {entities}; doubled braces ({{ }})
# render as literal braces. The model is expected to answer with
# {"triples": [[subject, predicate, object], ...]}.
extract_triplets_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to construct an RDF (Resource Description Framework) graph from the given passages and entity lists. 
Respond with a JSON list of triples, with each triple representing a relationship in the RDF graph. 

Pay attention to the following requirements:
- Each triple should contain at least one, but preferably two, of the named entities in the list for each passage.
- Clearly resolve pronouns to their specific names to maintain clarity.
"""),
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```

{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        # The example answer must itself be valid JSON: the model imitates it
        # and the reply is fed to a JSON output parser.
        ("ai", """{{"triples": [
            ["Radio City", "located in", "India"],
            ["Radio City", "is", "private FM radio station"],
            ["Radio City", "started on", "3 July 2001"],
            ["Radio City", "plays songs in", "Hindi"],
            ["Radio City", "plays songs in", "English"],
            ["Radio City", "forayed into", "New Media"],
            ["Radio City", "launched", "PlanetRadiocity.com"],
            ["PlanetRadiocity.com", "launched in", "May 2008"],
            ["PlanetRadiocity.com", "is", "music portal"],
            ["PlanetRadiocity.com", "offers", "news"],
            ["PlanetRadiocity.com", "offers", "videos"],
            ["PlanetRadiocity.com", "offers", "songs"]
    ]
}}"""),
        # Same wording as the worked example above so the real request matches the demonstration.
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
{passage_text}
```

{entities}"""),
    ]
)

In [14]:
def extend_document_chunks_with_entities_and_triples(llm_model, document_chunks: List[Dict]) -> List[Dict]:
    """Annotate every chunk, in place, with LLM-extracted entities and triples.

    For each chunk two chains are invoked: the first extracts the entity list
    from ``chunk["text"]``; the second receives the same text plus that entity
    list and extracts the triples.

    Args:
        llm_model: Model name passed to ``ChatOpenAI(model=...)``.
        document_chunks: Chunk dictionaries; each must provide a ``"text"`` key.

    Returns:
        The same list object that was passed in. Every chunk gains the keys
        ``"named_entities"`` and ``"triples"``. When processing a chunk fails,
        the error is printed and the chunk keeps whatever was assigned before
        the failure (empty lists by default), so one bad chunk does not abort
        the run.

    Raises:
        Whatever constructing the model or the chains raises (for example a
        missing API key). These are configuration errors that would affect
        every chunk, so they are surfaced once instead of once per chunk.
    """
    import json  # local import: only needed to serialise the entity list for the prompt

    # The chains do not depend on the chunk, so build them once, not per iteration.
    json_output_parser = SimpleJsonOutputParser()
    chat_model = ChatOpenAI(model=llm_model, temperature=0.0)
    chain_entities = extract_entities_prompt | chat_model | json_output_parser
    chain_triples = extract_triplets_prompt | chat_model | json_output_parser

    for chunk in document_chunks:
        # Defaults first, so a failed chunk still has both keys.
        chunk["named_entities"] = []
        chunk["triples"] = []
        try:
            named_entities = chain_entities.invoke({"passage_text": chunk["text"]})
            chunk["named_entities"] = named_entities["entities"]

            # Serialise as JSON so the prompt shows {"entities": [...]} exactly
            # like the few-shot example, rather than a Python dict repr with
            # single quotes.
            triples = chain_triples.invoke(
                {
                    "passage_text": chunk["text"],
                    "entities": json.dumps(named_entities, ensure_ascii=False),
                }
            )
            chunk["triples"] = triples["triples"]
        except Exception as e:
            # Best effort: report and move on to the next chunk.
            print(f"Error processing passage: {e}")
            continue

    return document_chunks

In [15]:
# Run both extraction passes over every chunk. The chunks are annotated in place,
# so this name refers to the same list object as `document_chunks`.
document_chunks_with_entities_and_triples = extend_document_chunks_with_entities_and_triples(llm, document_chunks)

2024-11-05 11:33:05 R2GWRJJGF9 httpx[40146] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-05 11:33:08 R2GWRJJGF9 httpx[40146] INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Each chunk dictionary now also contains the extracted named entities and triples:

```python
{
    id: int,
    text: str,
    named_entities: List[str],
    triples: List[List[str]]
}
```

In [16]:
# Display the first chunk again to check the newly added keys.
document_chunks_with_entities_and_triples[0]

{'id': 0,
 'text': 'HippoRAG: Neurobiologically Inspired\nLong-Term Memory for Large Language Models\nBernal Jiménez Gutiérrez\nThe Ohio State University\njimenezgutierrez.1@osu.eduYiheng Shu\nThe Ohio State University\nshu.251@osu.edu\nYu Gu\nThe Ohio State University\ngu.826@osu.eduMichihiro Yasunaga\nStanford University\nmyasu@cs.stanford.eduYu Su\nThe Ohio State University\nsu.809@osu.edu\nAbstract\nIn order to thrive in hostile and ever-changing natural environments, mammalian\nbrains evolved to store large amounts of knowledge about the world and continually\nintegrate new information while avoiding catastrophic forgetting. Despite the\nimpressive accomplishments, large language models (LLMs), even with retrieval-\naugmented generation (RAG), still struggle to efficiently and effectively integrate\na large amount of new experiences after pre-training. In this work, we introduce\nHippoRAG, a novel retrieval framework inspired by the hippocampal indexing\ntheory of human long-term 

## Save Document Structure

In [17]:
# Derive the artifact path from the PDF path by dropping its extension.
# Single quotes inside the f-string keep this valid on Python < 3.12, where
# reusing the enclosing quote character is a SyntaxError (same form as
# `embeddings_filepath` further down).
document_structure_file = f"{document_name.rsplit('.', 1)[0]}_document_structure.pkl"
save_document_structure(document_chunks_with_entities_and_triples, document_structure_file)

2024-11-05 11:33:25 R2GWRJJGF9 root[40146] INFO Saving document structure to ../2405.14831v1_document_structure.pkl


# Filter Named entities

In [18]:
# Filter the extracted entities (the log below suggests lower-casing and de-duplication — TODO confirm in hackathon.tools).
# NOTE: this rebinds the same variable, so re-running the cell feeds already-filtered chunks back in.
document_chunks_with_entities_and_triples = filter_named_entities(document_chunks_with_entities_and_triples)

2024-11-05 11:33:31 R2GWRJJGF9 root[40146] INFO Initial Named Entities (24): ['hipporag', 'neurobiologically inspired long-term memory for large language models', 'bernal jiménez gutiérrez', 'the ohio state university', 'yiheng shu', 'yu gu', 'michihiro yasunaga', 'stanford university', 'yu su', 'mammalian brains', 'large language models (llms)', 'retrieval-augmented generation (rag)', 'personalized pagerank', 'neocortex', 'hippocampus', 'ircot', 'alzheimer’s', 'prof. thomas', 'c-shaped hippocampus', 'scientific literature review', 'legal case briefing', 'medical diagnosis', 'multi-hop question answering (qa)', 'hippocampal memory indexing theory']
2024-11-05 11:33:31 R2GWRJJGF9 root[40146] INFO Initial Named Entities after dedup (24): {'stanford university', 'the ohio state university', 'yu su', 'hippocampal memory indexing theory', 'hippocampus', 'medical diagnosis', 'retrieval-augmented generation (rag)', 'ircot', 'large language models (llms)', 'neurobiologically inspired long-term

## Save Document Structure after Filter Named Entities

In [19]:
# Checkpoint of the chunks after entity filtering; this file is reloaded in the Neo4j section.
# Single quotes inside the f-string keep this valid on Python < 3.12.
document_structure_file_with_ne = f"{document_name.rsplit('.', 1)[0]}_document_structure_with_ne.pkl"
save_document_structure(document_chunks_with_entities_and_triples, document_structure_file_with_ne)

2024-11-05 11:33:36 R2GWRJJGF9 root[40146] INFO Saving document structure to ../2405.14831v1_document_structure_with_ne.pkl


In [20]:
# Display the first chunk after filtering.
document_chunks_with_entities_and_triples[0]

{'id': 0,
 'text': 'HippoRAG: Neurobiologically Inspired\nLong-Term Memory for Large Language Models\nBernal Jiménez Gutiérrez\nThe Ohio State University\njimenezgutierrez.1@osu.eduYiheng Shu\nThe Ohio State University\nshu.251@osu.edu\nYu Gu\nThe Ohio State University\ngu.826@osu.eduMichihiro Yasunaga\nStanford University\nmyasu@cs.stanford.eduYu Su\nThe Ohio State University\nsu.809@osu.edu\nAbstract\nIn order to thrive in hostile and ever-changing natural environments, mammalian\nbrains evolved to store large amounts of knowledge about the world and continually\nintegrate new information while avoiding catastrophic forgetting. Despite the\nimpressive accomplishments, large language models (LLMs), even with retrieval-\naugmented generation (RAG), still struggle to efficiently and effectively integrate\na large amount of new experiences after pre-training. In this work, we introduce\nHippoRAG, a novel retrieval framework inspired by the hippocampal indexing\ntheory of human long-term 

# Dedup Entities

In [21]:
# Build the entity-name -> integer uid mapping over all chunks (contents displayed in the next cell).
entity2uid_dict = create_document_deduped_entities_dict(document_chunks_with_entities_and_triples)

In [27]:
# Display the full mapping.
entity2uid_dict

{'stanford university': 0,
 'the ohio state university': 1,
 'yu su': 2,
 'hippocampal memory indexing theory': 3,
 'hippocampus': 4,
 'medical diagnosis': 5,
 'retrieval-augmented generation (rag)': 6,
 'ircot': 7,
 'large language models (llms)': 8,
 'neurobiologically inspired long-term memory for large language models': 9,
 'neocortex': 10,
 'scientific literature review': 11,
 'mammalian brains': 12,
 'hipporag': 13,
 'store knowledge': 14,
 'bernal jiménez gutiérrez': 15,
 'legal case briefing': 16,
 'yu gu': 17,
 'multi-hop question answering (qa)': 18,
 'personalized pagerank': 19,
 'human memory': 20,
 'state-of-the-art methods': 21,
 'new experiences': 22,
 'long-term memory in llms': 23,
 'yiheng shu': 24,
 'alzheimer’s': 25,
 'existing rag methods': 26,
 'michihiro yasunaga': 27,
 'prof. thomas': 28,
 'c-shaped hippocampus': 29,
 'knowledge integration': 30}

## Save the entity to uid dict to a file

In [28]:
# Compute the path once so the file written and the path logged cannot drift apart.
# Single quotes inside the f-string keep this valid on Python < 3.12.
entity2uid_dict_file = f"{document_name.rsplit('.', 1)[0]}_entity2uid_dict.pkl"
with open(entity2uid_dict_file, "wb") as f:
    pickle.dump(entity2uid_dict, f)
logger.info(f"entity2uid_dict has been saved to {entity2uid_dict_file}")

2024-11-05 11:38:08 R2GWRJJGF9 root[40146] INFO entity2uid_dict has been saved to ../2405.14831v1_entity2uid_dict.pkl


# Matrix Creation

In [29]:
# Entity reference counts, indexed as matrix[entity_uid] (see the per-entity log further down).
matrix = create_matrix_entity_ref_count(document_chunks_with_entities_and_triples, entity2uid_dict)

# Print the matrix framed by a separator line above and below.
separator = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
for item in (separator, matrix, separator):
    print(item)

In [31]:
example_chunk = document_chunks_with_entities_and_triples[0]

n_of_entities = len(entity2uid_dict)
n_of_chunks = len(document_chunks_with_entities_and_triples)

# Invert the name -> uid mapping once. The previous per-iteration lookup rebuilt
# two lists and scanned them linearly for every entity (quadratic overall).
# uids are unique in entity2uid_dict, so the inversion is lossless.
uid2entity_dict = {uid: name for name, uid in entity2uid_dict.items()}

# Log, for every entity uid, its row of the count matrix.
for e_idx in range(n_of_entities):
    entity_name = uid2entity_dict[e_idx]
    logger.info(f"Entity: {e_idx} {entity_name} Per chunk count: {matrix[e_idx][:]}")



2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 0 stanford university Per chunk count: [1.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 1 the ohio state university Per chunk count: [4.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 2 yu su Per chunk count: [1.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 3 hippocampal memory indexing theory Per chunk count: [1.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 4 hippocampus Per chunk count: [2.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 5 medical diagnosis Per chunk count: [0.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 6 retrieval-augmented generation (rag) Per chunk count: [0.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 7 ircot Per chunk count: [2.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 8 large language models (llms) Per chunk count: [2.]
2024-11-05 11:40:20 R2GWRJJGF9 root[40146] INFO Entity: 9 neurobiologically inspired long-t

## Save the matrix to a file

In [32]:
# Compute the path once so the file written and the path logged cannot drift apart.
# Single quotes inside the f-string keep this valid on Python < 3.12.
matrix_file = f"{document_name.rsplit('.', 1)[0]}_entity_per_chunk_count_matrix.pkl"
with open(matrix_file, "wb") as f:
    pickle.dump(matrix, f)
logger.info(f"Entity per chunk count matrix has been saved to {matrix_file}")

2024-11-05 11:40:23 R2GWRJJGF9 root[40146] INFO Entity per chunk count matrix has been saved to ../2405.14831v1_entity_per_chunk_count_matrix.pkl


# Index Creation

In [16]:
# Embedding and FAISS index parameters
emb_dimension = 256  # embedding size; passed to generate_entity_embeddings, create_index and build_vector_index
recall_at_k = 3  # how far in the indices/distances we go, i.e. neighbours returned per query

# HNSW: M is the number of neighbours added to each vertex on insertion.
# M_max is the maximum number of links a vertex can have; M_max0 is the same limit for vertices in layer 0.
M = 64
# Faiss sets M_max and M_max0 automatically in the set_default_probas method, at index initialization:
# M_max is set to M and M_max0 to M*2.

## Generating embeddings for named entities in document


In [11]:
# Reload the entity -> uid mapping from its checkpoint so this section can run
# without repeating the extraction above.
# Single quotes inside the f-string keep this valid on Python < 3.12.
entity2uid_dict_path = f"{document_name.rsplit('.', 1)[0]}_entity2uid_dict.pkl"
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The `with` block closes the file handle, which the previous open(...).read() never did.
with open(entity2uid_dict_path, "rb") as f:
    named_entities_dict = pickle.load(f)
logger.info(f"Named entities dict loaded: {named_entities_dict}")
entities = list(named_entities_dict.keys())
logger.info(f"Number of entities: {len(entities)}. First entity is: {entities[0]}")

2024-11-05 13:12:26 R2GWRJJGF9 root[36374] INFO Named entities dict loaded: {'stanford university': 0, 'the ohio state university': 1, 'yu su': 2, 'hippocampal memory indexing theory': 3, 'hippocampus': 4, 'medical diagnosis': 5, 'retrieval-augmented generation (rag)': 6, 'ircot': 7, 'large language models (llms)': 8, 'neurobiologically inspired long-term memory for large language models': 9, 'neocortex': 10, 'scientific literature review': 11, 'mammalian brains': 12, 'hipporag': 13, 'store knowledge': 14, 'bernal jiménez gutiérrez': 15, 'legal case briefing': 16, 'yu gu': 17, 'multi-hop question answering (qa)': 18, 'personalized pagerank': 19, 'human memory': 20, 'state-of-the-art methods': 21, 'new experiences': 22, 'long-term memory in llms': 23, 'yiheng shu': 24, 'alzheimer’s': 25, 'existing rag methods': 26, 'michihiro yasunaga': 27, 'prof. thomas': 28, 'c-shaped hippocampus': 29, 'knowledge integration': 30}
2024-11-05 13:12:26 R2GWRJJGF9 root[36374] INFO Number of entities: 31.

### Generate (and save) entity embeddings and convert them to np


In [8]:
# Checkpoint file for the entity embeddings (PDF path minus extension, plus a suffix).
embeddings_filepath = f"{document_name.rsplit('.', 1)[0]}_entity_embeddings.pkl"

In [None]:

# Generate one embedding per entity; per the section heading this also saves them to `embeddings_filepath`.
# This cell can be skipped when the checkpoint exists: the next cell reloads it.
entities_embeddings = generate_entity_embeddings(entities, emb_dimension, embeddings_filepath)


### Checkpoint Step: Load the entity Embeddings (Just to continue from here)

In [9]:
# Reload the embeddings checkpoint and convert it to a NumPy array for FAISS.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The `with` block closes the file handle, which the previous open(...).read() never did.
with open(embeddings_filepath, "rb") as f:
    entities_embeddings = pickle.load(f)
entities_embeddings = np.array(entities_embeddings)

## Create the index and query it with the same elements that were indexed

In [17]:
# Index every entity embedding, then query the index with those same embeddings.
# In the output below the first column is each entity itself (distance 0); the other columns are its nearest neighbours.
faiss_index = create_index(entities_embeddings, emb_dimension, M)
distances, indexes = search_index(faiss_index, entities_embeddings, recall_at_k)

2024-11-05 13:14:53 R2GWRJJGF9 root[36374] INFO 
Distances:
[[0.    0.985 1.282]
 [0.    0.985 1.469]
 [0.    0.799 0.932]
 [0.    0.858 1.041]
 [0.    0.763 0.858]
 [0.    1.318 1.383]
 [0.    1.022 1.259]
 [0.    1.392 1.396]
 [0.    0.748 0.76 ]
 [0.    0.699 0.76 ]
 [0.    1.014 1.136]
 [0.    1.276 1.406]
 [0.    1.161 1.164]
 [0.    0.882 1.227]
 [0.    0.853 1.268]
 [0.    1.298 1.483]
 [0.    1.428 1.521]
 [0.    0.799 0.927]
 [0.    1.313 1.33 ]
 [0.    1.345 1.391]
 [0.    1.046 1.05 ]
 [0.    1.276 1.278]
 [0.    1.278 1.281]
 [0.    0.699 0.748]
 [0.    0.927 0.932]
 [0.    1.158 1.268]
 [0.    1.022 1.279]
 [0.    1.063 1.274]
 [0.    1.282 1.298]
 [0.    0.763 1.066]
 [0.    0.853 1.26 ]]
Indices:
[[ 0  1 28]
 [ 1  0 28]
 [ 2 17 24]
 [ 3  4 23]
 [ 4 29  3]
 [ 5 25  2]
 [ 6 26 13]
 [ 7 19  6]
 [ 8 23  9]
 [ 9 23  8]
 [10  4 29]
 [11 21  5]
 [12 23  4]
 [13  4 29]
 [14 30  3]
 [15 28  2]
 [16 11 28]
 [17  2 24]
 [18  8  6]
 [19  6  3]
 [20  4  3]
 [21 11 22]
 [22 21 14]
 [2

## Build Similar Entities with Recall at K

In [19]:
# Keep neighbour pairs whose distance is at most max_distance (the log below reports "<=0.85 dist").
similar_entities = build_similar_entities(entities, indexes, distances, recall_at_k, max_distance=0.85)  # Original max_distance=0.7
logger.info(f"Similar entities:\n{similar_entities}")

# TODO Scores discarded for now
# NOTE(review): build_similar_entities_with_scores is not imported in this notebook;
# add it to the hackathon.index import before re-enabling the lines below.
# scores = calculate_scores(distances)
# similar_entities_score = build_similar_entities_with_scores(entities, indexes, scores, recall_at_k, min_score=0.5)            
# logger.info(similar_entities_score)

2024-11-05 13:15:04 R2GWRJJGF9 root[36374] INFO Similarity (<=0.85 dist) found for yu su (2) with yu gu (17): Distance 0.7987977266311646
2024-11-05 13:15:04 R2GWRJJGF9 root[36374] INFO Similarity (<=0.85 dist) found for hippocampus (4) with c-shaped hippocampus (29): Distance 0.7634723782539368
2024-11-05 13:15:04 R2GWRJJGF9 root[36374] INFO Similarity (<=0.85 dist) found for large language models (llms) (8) with long-term memory in llms (23): Distance 0.7482843399047852
2024-11-05 13:15:04 R2GWRJJGF9 root[36374] INFO Similarity (<=0.85 dist) found for large language models (llms) (8) with neurobiologically inspired long-term memory for large language models (9): Distance 0.7598475217819214
2024-11-05 13:15:04 R2GWRJJGF9 root[36374] INFO Similarity (<=0.85 dist) found for neurobiologically inspired long-term memory for large language models (9) with long-term memory in llms (23): Distance 0.6987998485565186
2024-11-05 13:15:04 R2GWRJJGF9 root[36374] INFO Similarity (<=0.85 dist) found

# Create Neo4J Graph

In [2]:
# Connection settings come from the environment (populated by load_dotenv() in the first cell).
# A missing variable raises KeyError here instead of failing later inside the driver.
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
NEO4J_DATABASE = os.environ["NEO4J_DB"]  # note: the variable is named NEO4J_DB, not NEO4J_DATABASE

# Graph handle used by every Neo4j step below.
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

Required to create the graph:
    - entity_embeddings
    - named entities dict 
    - doc structure
    - similar entities

## Step 1: Add all entities to the graph

In [3]:
# DESTRUCTIVE: the log below reports the database being cleaned (nodes deleted). Run only against a scratch database.
clean_db(kg)

2024-11-05 13:10:28 R2GWRJJGF9 root[36374] INFO Cleaning Neo4j database
2024-11-05 13:10:29 R2GWRJJGF9 root[36374] INFO Result after deleting nodes:
[]
2024-11-05 13:10:29 R2GWRJJGF9 root[36374] INFO Result after properties:
[]


In [12]:
# Add the entities to the graph, passing their embeddings and the name -> uid mapping.
# Presumably embeddings are matched to entities through the uid — TODO confirm in hackathon.graph_neo4j.
add_entities(kg, entities_embeddings, named_entities_dict)

## Step 2: Add RELATES_TO relationships

In [13]:
# Reload the filtered document structure checkpoint and add one RELATES_TO edge per extracted triple.
# Single quotes inside the f-string keep this valid on Python < 3.12.
doc_structure_path = f"{document_name.rsplit('.', 1)[0]}_document_structure_with_ne.pkl"
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
# The `with` block closes the file handle, which the previous open(...).read() never did.
with open(doc_structure_path, "rb") as f:
    doc_structure = pickle.load(f)
add_relates_to_relationships(kg, doc_structure)


2024-11-05 13:13:02 R2GWRJJGF9 neo4j.notifications[36374] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 37} for query: '\n    UNWIND $triplets AS triplet\n    MATCH (a:Entity {name: triplet.subject}), (b:Entity {name: triplet.object})\n    MERGE (a)-[:RELATES_TO {type: triplet.predicate}]->(b)\n    '


## Step 3: Add SIMILAR_TO relationships

In [20]:
# Add SIMILAR_TO edges between the entity pairs found by the FAISS search (query text is visible in the notification below).
add_similar_entities(kg, similar_entities)

2024-11-05 13:15:08 R2GWRJJGF9 neo4j.notifications[36374] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 40} for query: '\n    UNWIND $similar_entities AS se\n    MATCH (a:Entity {name: se.entity}), (b:Entity {name: se.similar_entity})\n    MERGE (a)-[:SIMILAR_TO]->(b)\n    '


## Step 4: Build vector index

In [22]:
# Create the Neo4j vector index over Entity.embedding.
# The query uses CREATE VECTOR INDEX ... IF NOT EXISTS (see the notification below), so an existing
# index is left untouched; drop it first if emb_dimension has changed.
build_vector_index(kg, emb_dim=emb_dimension)

2024-11-05 13:23:47 R2GWRJJGF9 neo4j.notifications[36374] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE VECTOR INDEX entityIdx IF NOT EXISTS FOR (e:Entity) ON (e.embedding) OPTIONS {indexConfig: {`vector.dimensions`: $emb_dim, `vector.similarity_function`: $sim_func}}` has no effect.} {description: `VECTOR INDEX entityIdx FOR (e:Entity) ON (e.embedding)` already exists.} {position: None} for query: '\n    CREATE VECTOR INDEX $idx_name IF NOT EXISTS\n    FOR (m:Entity)\n    ON m.embedding\n    OPTIONS {indexConfig: {\n        `vector.dimensions`: $emb_dim,\n        `vector.similarity_function`: $sim_func\n    }}\n    '
