In [None]:
import json
import logging
import os
import sys

import nest_asyncio
import numpy as np
from dotenv import load_dotenv
from groq import Groq
from llama_index.core import Settings, ServiceContext, StorageContext, SimpleDirectoryReader, PropertyGraphIndex
from llama_index.core import VectorStoreIndex
from llama_index.core.agent import ReActAgent, FunctionCallingAgentWorker, AgentRunner
from llama_index.core.indices.property_graph import VectorContextRetriever
from llama_index.core.tools import BaseTool, FunctionTool
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.graph_stores.neo4j import Neo4jGraphStore, Neo4jPropertyGraphStore
from llama_index.llms.groq import Groq as Groq_llamaindex
from llama_index.llms.openai import OpenAI
from llama_index.llms.replicate import Replicate
from milvus import default_server
# Bookkeeping labels that the Cypher queries further down filter out when
# reporting a node's type.
BASE_ENTITY_LABEL = "__Entity__"
BASE_NODE_LABEL = "__Node__"

# Configure logging before anything else so warnings emitted below are visible.
# basicConfig already attaches a stdout handler to the root logger; attaching a
# second StreamHandler on top of it made every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Load variables from a local .env file, if one exists. By default this does
# not override variables that are already set in the process environment.
load_dotenv()

# Retrieve API keys and credentials from the environment (never hard-code them).
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_URL = os.getenv('NEO4J_URL', 'bolt://localhost:7687')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')
REPLICATE_API_KEY = os.getenv('REPLICATE_API_KEY')

if REPLICATE_API_KEY:
    # Assigning None into os.environ raises TypeError, so only write the key
    # back when it is actually present.
    os.environ["REPLICATE_API_KEY"] = REPLICATE_API_KEY
    # NOTE(review): the replicate client is documented to read
    # REPLICATE_API_TOKEN; mirror the key there unless a token is already set.
    # Confirm against the installed replicate / llama-index versions.
    os.environ.setdefault("REPLICATE_API_TOKEN", REPLICATE_API_KEY)
else:
    logging.warning(
        "REPLICATE_API_KEY is not set; calls to the Replicate LLM may fail to authenticate."
    )

# Initialize the Replicate-hosted LLM.
llm = Replicate(
    model="meta/meta-llama-3-70b-instruct"
)

# Make the LLM and the embedding model the global LlamaIndex defaults.
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# NOTE(review): this sets an attribute on the ServiceContext *class*, not on an
# instance. Kept as-is because its effect depends on the installed llama-index
# version; Settings.llm above is the supported way to set the default LLM.
ServiceContext.llm = llm

# Connect to the Neo4j database that already holds the property graph.
property_graph_store = Neo4jPropertyGraphStore(
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    url=NEO4J_URL,
    database=NEO4J_DATABASE,
)
storage_context = StorageContext.from_defaults(property_graph_store=property_graph_store)

# Build the index wrapper on top of the existing graph store.
index = PropertyGraphIndex.from_existing(
    property_graph_store=property_graph_store,
    llm=llm,
    embed_model=Settings.embed_model,
)




INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']
INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT IF NOT EXISTS FOR (e:__Node__) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT constraint_ec67c859 FOR (e:__Node__) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:`__Node__`)\n            REQUIRE n.id IS UNIQUE;'
Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRA

In [None]:
class DummnQuery:
    """Lightweight query object handed to ``property_graph_store.vector_query``.

    It only carries the four attributes set below; nothing else is stored.

    Args:
        query_str: The natural-language question text.
        embedding: The embedding of ``query_str``; stored as ``query_embedding``.
        similarity_top_k: Number of nearest nodes to request. Defaults to 4,
            the value that was previously hard-coded.
        filters: Optional filters object. Defaults to ``None`` (the previous
            hard-coded value).
    """

    def __init__(self, query_str, embedding, similarity_top_k=4, filters=None):
        self.query_str = query_str
        self.filters = filters
        self.query_embedding = embedding
        self.similarity_top_k = similarity_top_k



In [4]:
# Load the question/answer dataset from
# transport_data/sp_cot/pseudo_dataset_completion.json.
# `json` is imported in the notebook's first cell. The encoding is passed
# explicitly so the read does not depend on the platform's locale default.
with open('transport_data/sp_cot/pseudo_dataset_completion.json', encoding='utf-8') as f:
    data = json.load(f)



In [5]:
# Flatten every category of the dataset into two parallel lists:
# questions[i] is answered by answers[i].
types = data.keys()
questions = []
answers = []
# The loop variable is deliberately not called `type`: at notebook top level
# that would overwrite the builtin for every cell that runs afterwards.
for question_type in types:
    for entry in data[question_type]:
        questions.append(entry['question'])
        answers.append(entry['answer'])


In [None]:
# Sweep the relation path depth from 1 to 6. For every question, take the seed
# entities returned by a vector query and dump the relations around them twice:
# once in plain traversal order ("unsorted") and once ranked per seed by cosine
# similarity to the question embedding ("sorted").

# Patch asyncio so nested event loops work inside the notebook kernel.
# Applying it once up front is enough; it used to be re-applied on every depth.
nest_asyncio.apply()

# Number of seed nodes fetched per question.
similarity_top_k = 4


def _extract_triples(records):
    """Reduce structured-query rows to the fields written to the dump files."""
    return [
        {
            "seed_id": record["seed_id"],
            "source_id": record["source_id"],
            "type": record["type"],
            "target_id": record["target_id"],
            "rel_score": record["rel_score"],
        }
        for record in records
    ]


def _write_triples(out_dir, question_number, query_str, seed_ids, triples):
    """Write one question's seeds and triples to ``<out_dir>/<question_number>.txt``.

    Layout: the question on the first line, one ``Seed: <id>`` line per seed,
    then one ``[seed]: head --[relation]-> tail score`` line per triple.
    """
    # Explicit UTF-8 so non-ASCII question text cannot fail on platforms whose
    # default encoding is narrower.
    with open(f"{out_dir}/{question_number}.txt", 'w', encoding="utf-8") as f:
        f.write(f"{query_str}\n")
        for seed_id in seed_ids:
            f.write(f"Seed: {seed_id}\n")
        for item in triples:
            f.write(f"[{item['seed_id']}]: {item['source_id']} --[{item['type']}]-> {item['target_id']} {item['rel_score']}\n")


for current_depth in range(1, 7):
    path_depth = current_depth
    vector_retriever = VectorContextRetriever(
        index.property_graph_store,
        # only needed when the graph store doesn't support vector queries
        # vector_store=index.vector_store,
        embed_model=Settings.embed_model,
        # include source chunk text with retrieved paths
        include_text=True,
        # the number of nodes to fetch
        similarity_top_k=similarity_top_k,
        # the depth of relations to follow after node retrieval
        path_depth=path_depth,
    )

    # NOTE(review): the query engine is not used anywhere in this cell; it is
    # kept in case later cells rely on the name.
    index_query_engine = index.as_query_engine(sub_retrievers=[vector_retriever])
    index_retriever = index.as_retriever(sub_retrievers=[vector_retriever])

    # Output directories depend only on the depth, so create them once per
    # depth; exist_ok avoids the separate check-then-create step.
    unsorted_dir = f"OIA Nodes/full_unsorted_k{similarity_top_k}_d{path_depth}"
    sorted_dir = f"OIA Nodes/full_sorted_k{similarity_top_k}_d{path_depth}"
    os.makedirs(unsorted_dir, exist_ok=True)
    os.makedirs(sorted_dir, exist_ok=True)

    for i, question in enumerate(questions):
        query_str = question
        print(query_str)
        embedding = index_retriever.sub_retrievers[0]._embed_model.get_agg_embedding_from_queries([query_str])
        query = DummnQuery(query_str, embedding)
        # DummnQuery carries its own top-k (4 by default); set it explicitly so
        # the number of seeds always follows this cell's setting.
        query.similarity_top_k = similarity_top_k

        # The first element of the vector_query result holds the matched nodes;
        # their names are used as the seed ids.
        nodes_with_score = property_graph_store.vector_query(query)
        ids = [node.name for node in nodes_with_score[0]]

        # Baseline: relations within path_depth hops of each seed, in traversal
        # order, capped by $limit. MENTIONS edges are excluded.
        response_unsorted = property_graph_store.structured_query(
            f"""
        WITH $ids AS id_list
        UNWIND range(0, size(id_list) - 1) AS idx
        MATCH (e:`{BASE_ENTITY_LABEL}`)
        WHERE e.id = id_list[idx]
        MATCH p=(e)-[r*1..{path_depth}]-(other)
        WHERE ALL(rel IN relationships(p) WHERE type(rel) <> 'MENTIONS')
        UNWIND relationships(p) AS rel
        WITH DISTINCT rel, idx, id_list[idx] AS seed_id, vector.similarity.cosine(rel.embedding, $embedding) AS rel_score
        WITH startNode(rel) AS source,
            type(rel) AS type,
            rel{{.*}} AS rel_properties,
            endNode(rel) AS endNode,
            idx,
            rel_score,
            seed_id
        LIMIT toInteger($limit)
        RETURN source.id AS source_id, 
            [l IN labels(source) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS source_type,
            source{{.*, embedding: Null, id: Null}} AS source_properties,
            type,
            rel_properties,
            endNode.id AS target_id, 
            [l IN labels(endNode) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS target_type,
            endNode{{.*, embedding: Null, id: Null}} AS target_properties,
            idx,
            rel_score,
            seed_id
        ORDER BY idx
        LIMIT toInteger($limit)
        """,
            param_map={"ids": ids, "limit": 30, "embedding": query.query_embedding},
        )

        # Ranked variant: per seed, keep the 7 relations whose embedding is most
        # similar to the question, and attach the best-matching MENTIONS text
        # node for each endpoint. The "limit" parameter is not referenced by
        # this query text; the per-seed cap is the literal 7 in the subquery.
        response_sorted = property_graph_store.structured_query(f"""
        // Assign an index to each id in the original list
        WITH [i IN RANGE(0, size($ids)-1) | {{id: $ids[i], seed_idx: i}}] AS id_list
        UNWIND id_list AS id_map
        MATCH (e:`{BASE_ENTITY_LABEL}`)
        WHERE e.id = id_map.id
        WITH e, id_map.id AS id, id_map.seed_idx AS seed_idx  // Include seed_idx

        // For each id, find the top N most relevant relations
        CALL {{
            WITH e
            MATCH p=(e)-[r*1..{path_depth}]-(other)
            WHERE ALL(rel IN relationships(p) WHERE type(rel) <> 'MENTIONS')
            UNWIND relationships(p) AS rel
            WITH DISTINCT rel
            WHERE rel.embedding IS NOT NULL AND size(rel.embedding) = $dimension
            WITH rel, vector.similarity.cosine(rel.embedding, $embedding) AS rel_score
            ORDER BY rel_score DESC
            LIMIT toInteger(7)
            RETURN rel, rel_score
        }}

        WITH DISTINCT rel, rel_score, id AS seed_id, seed_idx  // Include seed_idx
        WITH startNode(rel) AS source,
            type(rel) AS type,
            rel{{.*}} AS rel_properties,
            endNode(rel) AS endNode,
            seed_id,
            seed_idx,  // Include seed_idx
            rel_score

        // Find the best matching text node for the source node
        CALL {{
            WITH source
            OPTIONAL MATCH (source)-[:MENTIONS]-(text_source)
            WHERE text_source.embedding IS NOT NULL AND size(text_source.embedding) = $dimension
            WITH text_source, vector.similarity.cosine(text_source.embedding, $embedding) AS source_score
            ORDER BY source_score DESC
            LIMIT 1
            RETURN text_source.id AS source_best_text_id, source_score AS source_best_score
        }}

        // Find the best matching text node for the end node
        CALL {{
            WITH endNode
            OPTIONAL MATCH (endNode)-[:MENTIONS]-(text_target)
            WHERE text_target.embedding IS NOT NULL AND size(text_target.embedding) = $dimension
            WITH text_target, vector.similarity.cosine(text_target.embedding, $embedding) AS target_score
            ORDER BY target_score DESC
            LIMIT 1
            RETURN text_target.id AS target_best_text_id, target_score AS target_best_score
        }}

        RETURN DISTINCT 
            source.id AS source_id,
            [l IN labels(source) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS source_type,
            source{{.*, embedding: NULL, id: NULL, best_text_id: source_best_text_id, best_text_score: source_best_score }} AS source_properties,
            type,
            rel_properties,
            endNode.id AS target_id,
            [l IN labels(endNode) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS target_type,
            endNode{{.*, embedding: NULL, id: NULL, best_text_id: target_best_text_id, best_text_score: target_best_score }} AS target_properties,
            rel_score,
            seed_id,
            seed_idx  // Include seed_idx

        ORDER BY seed_idx, rel_score DESC  // Order by seed_idx to maintain initial order
        """,
            param_map={
                "ids": ids,
                "limit": 30,
                "embedding": query.query_embedding,
                "dimension": len(query.query_embedding),
            },
        )

        extracted_response_unsorted = _extract_triples(response_unsorted)
        extracted_response_sorted = _extract_triples(response_sorted)
        print(extracted_response_unsorted)

        # Files are numbered from 1, in question order.
        _write_triples(unsorted_dir, i + 1, query.query_str, ids, extracted_response_unsorted)
        _write_triples(sorted_dir, i + 1, query.query_str, ids, extracted_response_sorted)
