In [1]:
import json

data = {
    "name": "John Doe",
    "message": "Hello, \"world\"!\nHow's it going?"
}

# Escaping the JSON data using json.dumps
json.dumps(data)

'{"name": "John Doe", "message": "Hello, \\"world\\"!\\nHow\'s it going?"}'

# Exploring similarity

In [3]:
import faiss

from typing import List, Optional

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

import numpy as np
import pickle


from llm_foundation import logger



document_name: str = "2405.14831v1.pdf"

# Embeddings and FAISS index params
emb_dimension = 256
M = 32

# M_max defines the maximum number of links a vertex can have, and M_max0, which defines the same but for vertices in layer 0.
M = 64  # for HNSW index, the number of neighbors we add to each vertex on insertion. 
# Faiss sets M_max and M_max0 automatically in the set_default_probas method, at index initialization. 
# The M_max value is set to M, and M_max0 set to M*2

2024-10-27 22:57:02 R2GWRJJGF9 root[55402] INFO Logger root configured


Trying to configure logger root in module llm_foundation
root # of associated handlers - 0
Logging is not configured yet. Configuring it now.
Basic logging config


In [29]:
named_entities_dict = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_entity2uid_dict.pkl", "rb").read())
print(named_entities_dict)
entities = list(named_entities_dict.keys())


{'bernal jiménez gutiérrez': 0, 'shu.251@osu.edu': 1, 'myasu@cs.stanford.edu': 2, 'mammalian': 3, 'rag': 4, 'michihiro yasunaga': 5, 'hippocampal indexing': 6, 'su.809@osu.edu': 7, 'stanford university': 8, 'mammalian brains': 9, 'retrieval-augmented generation': 10, 'hipporag': 11, 'the ohio state university': 12, 'gu.826@osu.edu': 13, 'yu gu': 14, 'yiheng shu': 15, 'jimenezgutierrez.1@osu.edu': 16, 'yu su': 17}


In [30]:
entities[0]

'bernal jiménez gutiérrez'

In [None]:

logger.info(f"Entities: {entities}")
logger.info(f"{len(entities)} entities loaded in the document")

def generate_entity_embeddings(entities: list, emb_dimension: int = 256, emb_save_path: Optional[str] = None):
    openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=emb_dimension)
    entities_embeddings = openai_embeddings.embed_documents(entities)  # Embeddings are a list of list of floats
    logger.debug(f"Entities embeddings: {entities_embeddings}")
    
    if emb_save_path:
        with open(emb_save_path, "wb") as f:
            pickle.dump(entities_embeddings, f)
    
    return entities_embeddings



In [None]:

# Generate entity embeddings
embeddings_saving_path = f"{document_name.rsplit('.', 1)[0]}_entity_embeddings.pkl"
entities_embeddings = generate_entity_embeddings(entities, emb_dimension, embeddings_saving_path)


# Retrieve Document Embeddings

In [19]:
entities_embeddings = pickle.loads(open(f"{document_name.rsplit('.', 1)[0]}_entity_embeddings.pkl", "rb").read())
# entities_embeddings

In [55]:

embs_as_nparrays = np.array(entities_embeddings)

def create_index(vectors: np.ndarray, emb_dimension: int, M: int) -> faiss.IndexHNSWFlat:
    # See https://www.pinecone.io/learn/series/faiss/hnsw/ for info about HNSW
    # See also https://bakingai.com/blog/hnsw-semantic-search-faiss-integration/
    faiss_index = faiss.IndexHNSWFlat(emb_dimension, M)

    # convert to numpy array

    faiss_index.add(vectors)  # Build the index

    return faiss_index

def search_index(faiss_index: faiss.IndexHNSWFlat, query: list, recall_at_k: int) -> tuple:
    distances, indices = faiss_index.search(query, recall_at_k)
    logger.info(f"\nDistances:\n{np.round(distances, 3)}\nIndices:\n{indices}")
    
    return distances, indices

def calculate_scores(distances: list) -> list:
    scores = 1 / (1 + distances)  # Inverting distances to get similarity scores.
    print("Similarity Scores: ", scores)
    return scores

def build_similar_entities(entities: list, indices: list, distances: list, recall_at_k: int, max_distance: float=0.7) -> list:
    similar_entities = []
    for idx, entity in enumerate(entities):
        for i in range(recall_at_k):
            if indices[idx][i] != idx and distances[idx][i] <= max_distance:
                logger.info(f"Similarity (<={max_distance} dist) found for {entity} ({idx}) with {entities[indices[idx][i]]} ({indices[idx][i]}): Distance {distances[idx][i]}")
                similar_entities.append(
                    {
                        "entity": entity, 
                        "similar_entity": entities[indices[idx][i]]
                    }
                )
    return similar_entities

# https://medium.com/@asakisakamoto02/how-to-use-faiss-similarity-search-with-score-explained-99ea3fe964cf
def build_similar_entities_with_scores(entities: list, indices: list, scores: list, recall_at_k: int, min_score: float=0.5) -> list:
    similar_entities = []
    for idx, entity in enumerate(entities):
        for i in range(recall_at_k):
            if indices[idx][i] != idx and scores[idx][i] > min_score:
                logger.info(f"Similarity (>{min_score} score) found for {entity} ({idx}) with {entities[indices[idx][i]]} ({indices[idx][i]}): Score {scores[idx][i]}")
                similar_entities.append(
                    {
                        "entity": entity, 
                        "similar_entity": entities[indices[idx][i]]
                    }
                )
    return similar_entities


recall_at_k = 3  # how far in the indices/distances we go

faiss_index = create_index(embs_as_nparrays, emb_dimension, M)
# We query with the same elements we indexed
distances, indexes = search_index(faiss_index, embs_as_nparrays, recall_at_k)
similar_entities = build_similar_entities(entities, indexes, distances, recall_at_k, max_distance=0.85)  # Original max_distance=0.7

similar_entities_score = build_similar_entities(entities, indexes, distances, recall_at_k, max_distance=0.85)  # Original max_distance=0.7
            
print(similar_entities)


2024-10-27 18:41:22 R2GWRJJGF9 root[41795] INFO 
Distances:
[[0.    0.795 0.823]
 [0.    0.995 1.198]
 [0.    0.985 1.062]
 [0.    0.799 0.927]
 [0.    1.254 1.278]
 [0.    1.063 1.199]
 [0.    0.95  1.424]
 [0.    0.337 0.477]
 [0.    0.995 1.118]
 [0.    0.337 0.467]
 [0.    0.626 1.198]
 [0.    0.467 0.477]
 [0.    0.843 0.883]
 [0.    0.799 0.932]
 [0.    1.118 1.278]
 [0.    0.525 0.546]
 [0.    0.626 1.394]
 [0.    0.927 0.932]]
Indices:
[[ 0  9  7]
 [ 1  8 10]
 [ 2  0 12]
 [ 3 13 17]
 [ 4  1 14]
 [ 5 13 12]
 [ 6 15 12]
 [ 7  9 11]
 [ 8  1 14]
 [ 9  7 11]
 [10 16  1]
 [11  9  7]
 [12  9 11]
 [13  3 17]
 [14  8  4]
 [15  9  7]
 [16 10  8]
 [17  3 13]]
2024-10-27 18:41:22 R2GWRJJGF9 root[41795] INFO Similarity (<=0.85 dist) found for bernal jiménez gutiérrez (0) with mammalian brains (9): Distance 0.7946487069129944
2024-10-27 18:41:22 R2GWRJJGF9 root[41795] INFO Similarity (<=0.85 dist) found for bernal jiménez gutiérrez (0) with su.809@osu.edu (7): Distance 0.8227523565292358
202

[{'entity': 'bernal jiménez gutiérrez', 'similar_entity': 'mammalian brains'}, {'entity': 'bernal jiménez gutiérrez', 'similar_entity': 'su.809@osu.edu'}, {'entity': 'mammalian', 'similar_entity': 'gu.826@osu.edu'}, {'entity': 'su.809@osu.edu', 'similar_entity': 'mammalian brains'}, {'entity': 'su.809@osu.edu', 'similar_entity': 'hipporag'}, {'entity': 'mammalian brains', 'similar_entity': 'su.809@osu.edu'}, {'entity': 'mammalian brains', 'similar_entity': 'hipporag'}, {'entity': 'retrieval-augmented generation', 'similar_entity': 'jimenezgutierrez.1@osu.edu'}, {'entity': 'hipporag', 'similar_entity': 'mammalian brains'}, {'entity': 'hipporag', 'similar_entity': 'su.809@osu.edu'}, {'entity': 'the ohio state university', 'similar_entity': 'mammalian brains'}, {'entity': 'gu.826@osu.edu', 'similar_entity': 'mammalian'}, {'entity': 'yiheng shu', 'similar_entity': 'mammalian brains'}, {'entity': 'yiheng shu', 'similar_entity': 'su.809@osu.edu'}, {'entity': 'jimenezgutierrez.1@osu.edu', 'si

In [40]:
scores = calculate_scores(distances)

similar_entities_score = build_similar_entities_with_scores(entities, indexes, scores, recall_at_k, min_score=0.5)  # Original max_distance=0.7
            
print(similar_entities_score)

2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for the ohio state university (0) with su.809@osu.edu (9): Score 0.5572121143341064
2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for the ohio state university (0) with gu.826@osu.edu (7): Score 0.548620879650116
2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for hippocampal indexing (1) with hipporag (8): Score 0.5012723207473755
2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for stanford university (2) with the ohio state university (0): Score 0.5038489103317261
2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for yu gu (3) with yu su (13): Score 0.5559268593788147
2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for yu gu (3) with yiheng shu (17): Score 0.5188145041465759
2024-10-27 12:36:56 R2GWRJJGF9 root[70220] INFO Similarity (>0.5 score) found for bern

Similarity Scores:  [[1.         0.5572121  0.5486209 ]
 [1.         0.5012723  0.45496228]
 [1.         0.5038489  0.48497996]
 [1.         0.55592686 0.5188145 ]
 [1.         0.44373468 0.4389289 ]
 [1.         0.48466164 0.45469397]
 [1.         0.5129136  0.4124956 ]
 [1.         0.748004   0.6771923 ]
 [1.         0.5012723  0.47223186]
 [1.         0.748004   0.6814926 ]
 [1.         0.61486983 0.45496228]
 [1.         0.6814926  0.6771923 ]
 [1.         0.5425388  0.5310488 ]
 [1.         0.55592686 0.51753026]
 [1.         0.47223186 0.4389289 ]
 [1.         0.6557992  0.64700246]
 [1.         0.61486983 0.41763887]
 [1.         0.5188145  0.51753026]]
[{'entity': 'the ohio state university', 'similar_entity': 'su.809@osu.edu'}, {'entity': 'the ohio state university', 'similar_entity': 'gu.826@osu.edu'}, {'entity': 'hippocampal indexing', 'similar_entity': 'hipporag'}, {'entity': 'stanford university', 'similar_entity': 'the ohio state university'}, {'entity': 'yu gu', 'similar

# Individual similarities

In [53]:
logger.info(entities)

bernal = "Jiménez Gutiérrez".lower()


openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=emb_dimension)
bernal_embeddings = openai_embeddings.embed_documents([bernal])  # Embeddings are a list of list of floats
bernal_as_nparrays = np.array(bernal_embeddings)


2024-10-27 12:53:55 R2GWRJJGF9 root[70220] INFO ['the ohio state university', 'hippocampal indexing', 'stanford university', 'yu gu', 'retrieval-augmented generation', 'michihiro yasunaga', 'bernal jiménez gutiérrez', 'gu.826@osu.edu', 'hipporag', 'su.809@osu.edu', 'mammalian brains', 'shu.251@osu.edu', 'myasu@cs.stanford.edu', 'yu su', 'rag', 'jimenezgutierrez.1@osu.edu', 'mammalian', 'yiheng shu']
2024-10-27 12:53:55 R2GWRJJGF9 httpx[70220] INFO HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [54]:

distances, indexes = search_index(faiss_index, bernal_as_nparrays, recall_at_k)

# Distance wise calculation
for idx, entity in enumerate([bernal]):
    for i in range(recall_at_k):
        if entity != entities[indexes[idx][i]] and distances[idx][i] <= 0.95:
            logger.info(f"Similarity found for {bernal} with {entities[indexes[idx][i]]} ({indexes[idx][i]}): Distance {distances[idx][i]}")


import numpy as np
# Score wise calculation
scores = 1 / (1 + distances)  # Inverting distances to get similarity scores.
print("Similarity Scores: ", np.round(scores, 3))

for idx, entity in enumerate([bernal]):
    for i in range(recall_at_k):
        if entity != entities[indexes[idx][i]] and scores[idx][i] > 0.5:
            logger.info(f"Similarity found for {bernal} with {entities[indexes[idx][i]]} ({indexes[idx][i]}): Score {round(scores[idx][i], 3)}")


2024-10-27 12:53:55 R2GWRJJGF9 root[70220] INFO 
Distances:
[[0.299 0.761 1.343]]
Indices:
[[ 6 15  3]]
2024-10-27 12:53:55 R2GWRJJGF9 root[70220] INFO Similarity found for jiménez gutiérrez with bernal jiménez gutiérrez (6): Distance 0.2989356815814972
2024-10-27 12:53:55 R2GWRJJGF9 root[70220] INFO Similarity found for jiménez gutiérrez with jimenezgutierrez.1@osu.edu (15): Distance 0.7609552145004272
2024-10-27 12:53:55 R2GWRJJGF9 root[70220] INFO Similarity found for jiménez gutiérrez with bernal jiménez gutiérrez (6): Score 0.7699999809265137
2024-10-27 12:53:55 R2GWRJJGF9 root[70220] INFO Similarity found for jiménez gutiérrez with jimenezgutierrez.1@osu.edu (15): Score 0.5680000185966492


Similarity Scores:  [[0.77  0.568 0.427]]


# Neo4j Exploration

In [1]:
import os
from langchain_community.graphs import Neo4jGraph

NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
NEO4J_DATABASE = "neo4j"

print(NEO4J_URI)

neo4j+s://492f27e5.databases.neo4j.io


In [4]:
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

logger.info(type(kg))


2024-10-27 22:57:10 R2GWRJJGF9 root[55402] INFO <class 'langchain_community.graphs.neo4j_graph.Neo4jGraph'>


# Clean Database

In [10]:
query = """
MATCH (n)
DETACH DELETE n
"""

result = kg.query(query)
print(result)


query = """
MATCH (n)
CALL apoc.meta.nodeTypeProperties(n) YIELD propertyName
REMOVE n[propertyName]
"""

result = kg.query(query)
print(result)


[]
[]


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '': expected an expression, '.', ':', 'IS' or '[' (line 4, column 1 (offset: 25))
"REMOVE Entity"
              ^}

In [39]:
named_entities_dict

{'bernal jiménez gutiérrez': 0,
 'shu.251@osu.edu': 1,
 'myasu@cs.stanford.edu': 2,
 'mammalian': 3,
 'rag': 4,
 'michihiro yasunaga': 5,
 'hippocampal indexing': 6,
 'su.809@osu.edu': 7,
 'stanford university': 8,
 'mammalian brains': 9,
 'retrieval-augmented generation': 10,
 'hipporag': 11,
 'the ohio state university': 12,
 'gu.826@osu.edu': 13,
 'yu gu': 14,
 'yiheng shu': 15,
 'jimenezgutierrez.1@osu.edu': 16,
 'yu su': 17}

In [38]:
from typing import List

def add_all_entities(kg: Neo4jGraph, named_entities_dict: Dict):

    entities = list(named_entities_dict.keys())

    all_entities = [{"name": entity, "node_id": named_entities_dict[entity], "embedding": entities_embeddings[named_entities_dict[entity]]} for entity in entities]

    query = """
    UNWIND $all_entities AS ae
    MERGE (a:Entity {name: ae.name, node_id: ae.node_id, embedding: ae.embedding})
    """
    kg.query(query, {"all_entities": all_entities})



add_all_entities(kg, named_entities_dict)


2024-10-27 14:59:48 R2GWRJJGF9 neo4j.io[41795] ERROR Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687)))
2024-10-27 14:59:48 R2GWRJJGF9 neo4j.pool[41795] ERROR Unable to retrieve routing information


In [41]:
def add_relateto_relationships(kg: Neo4jGraph, doc_structure):
    triplets = []
    for chunk in doc_structure:
        for triple in chunk["triples"]:
            if len(triple) != 3:
                continue
            subject=triple[0].lower()
            predicate=triple[1].replace(" ", "_").upper()
            object=triple[2].lower()
            triplets.append({
                "subject": subject, 
                "predicate": predicate, 
                "object": object,
                "passageId_subject": chunk["id"],
                "passageId_object": chunk["id"],
            })
    
    query = """
    UNWIND $triplets AS triplet
    MATCH (a:Entity {name: triplet.subject}), (b:Entity {name: triplet.object})
    MERGE (a)-[:RELATES_TO {type: triplet.predicate}]->(b)
    """
    kg.query(query, {"triplets": triplets})
    
doc_structure = pickle.loads(open(f"{document_name.rsplit(".", 1)[0]}_document_structure.pkl", "rb").read())

add_relateto_relationships(kg, doc_structure)


2024-10-27 17:43:13 R2GWRJJGF9 neo4j.io[41795] ERROR Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687)))
2024-10-27 17:43:13 R2GWRJJGF9 neo4j.pool[41795] ERROR Unable to retrieve routing information
2024-10-27 17:43:16 R2GWRJJGF9 neo4j.notifications[41795] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b

In [53]:
def build_vector_index(kg: Neo4jGraph, idx_name = "entityIdx", emb_dim=256, sim_func='cosine'):
    query = """
    CREATE VECTOR INDEX $idx_name IF NOT EXISTS
    FOR (m:Entity)
    ON m.embedding
    OPTIONS {indexConfig: {
        `vector.dimensions`: $emb_dim,
        `vector.similarity_function`: $sim_func
    }}
    """
    kg.query(query, {'emb_dim': emb_dim, 'sim_func': sim_func})

build_vector_index(kg)

In [56]:
def add_similar_entities(kg: Neo4jGraph, similar_entities: List):
    query = """
    UNWIND $similar_entities AS se
    MATCH (a:Entity {name: se.entity}), (b:Entity {name: se.similar_entity})
    MERGE (a)-[:SIMILAR_TO]->(b)
    """
    kg.query(query, {"similar_entities": similar_entities})

add_similar_entities(kg, similar_entities)

2024-10-27 18:42:01 R2GWRJJGF9 neo4j.notifications[41795] INFO Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 3, column: 5, offset: 40} for query: '\n    UNWIND $similar_entities AS se\n    MATCH (a:Entity {name: se.entity}), (b:Entity {name: se.similar_entity})\n    MERGE (a)-[:SIMILAR_TO]->(b)\n    '


In [54]:
kg.query("show indexes")

[{'id': 2,
  'name': 'entityIdx',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Entity'],
  'properties': ['embedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 10, 28, 1, 20, 35, 138000000, tzinfo=<UTC>),
  'readCount': 773},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 10, 28, 0, 47, 30, 242000000, tzinfo=<UTC>),
  'readCount':