## Source Articles

- https://medium.com/@sauravjoshi23/complex-query-resolution-through-llamaindex-utilizing-recursive-retrieval-document-agents-and-sub-d4861ecd54e6

- https://medium.com/@sauravjoshi23/building-knowledge-graphs-rebel-llamaindex-and-rebel-llamaindex-8769cf800115#:~:text=language%20for%20NebulaGraph.-,Relation%20Extraction%20By%20End%2Dto%2Dend%20Language%20generation%20(REBEL,filtered%20with%20a%20RoBERTa%20model.


In [None]:
# Load environment variables from the .env file.
# `%load_ext dotenv` loads the dotenv IPython extension and `%dotenv` invokes
# its line magic; both are IPython-only syntax, not plain Python.
%load_ext dotenv
%dotenv

In [None]:
import os
import torch
import psycopg2
from decouple import config
from sqlalchemy.engine import make_url
from transformers import pipeline
from llama_index import (
    VectorStoreIndex,
    SummaryIndex,
    KnowledgeGraphIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext
)
from dotenv import load_dotenv
from llama_index.schema import IndexNode
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.llms import OpenAI
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.vector_stores import PGVectorStore
from llama_index.graph_stores import Neo4jGraphStore

In [None]:
# Load every source PDF from data/ into a dict keyed by document title.
titles = ["DeloitteFutureOfAI"]

documents = {}
for title in titles:
    pdf_path = f"data/{title}.pdf"
    reader = SimpleDirectoryReader(input_files=[pdf_path])
    documents[title] = reader.load_data()
print(f"loaded documents with {len(documents)} documents")

# Configure the LLM and wrap it in the service context handed to the indexes.
# NOTE(review): OPENAI_API_KEY is read here but never passed to OpenAI() in
# this cell -- presumably the client picks the key up from the environment;
# confirm.
OPENAI_API_KEY = config("OPENAI_API_KEY")
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
service_context = ServiceContext.from_defaults(llm=llm)

In [None]:
# Read the pgvector settings and open a Postgres connection in autocommit mode.
# NOTE(review): `conn` is not used by any later statement visible in this
# notebook -- confirm whether it is still needed.
connection_string = config("PGVECTOR_CONNECTION_STRING")
db_name = config("PGVECTOR_DATABASE")
conn = psycopg2.connect(connection_string)
conn.autocommit = True

# Parse the connection string so its individual parts can be passed on.
url = make_url(connection_string)

# Build the Postgres-backed vector store, then wrap it in a storage context.
pg_vector_store = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    port=url.port,
    user=url.username,
    password=url.password,
    table_name="DeloitteFutureOfAI",
    embed_dim=1536,  # openai embedding dimension
)
vector_storage_context = StorageContext.from_defaults(
    vector_store=pg_vector_store,
)



In [None]:
# Rebuild the LLM and its service context (same settings as the document
# loading cell, so this cell can be re-run on its own).
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
service_context = ServiceContext.from_defaults(llm=llm)

In [None]:
from llama_index.agent import OpenAIAgent

# Build one OpenAI agent per document, keyed by document title. Each agent
# gets two tools over the same document: a vector index query engine and a
# summary index query engine.
agents = {}

# The agent LLM does not depend on the document, so construct it once.
function_llm = OpenAI(model="gpt-3.5-turbo-0613")

for title in titles:
    # NOTE(review): the indexes below are built without a storage context, so
    # the `vector_storage_context` configured for pgvector is not used here --
    # confirm whether the vector index was meant to be backed by it.

    # build vector index
    vector_index = VectorStoreIndex.from_documents(
        documents[title], service_context=service_context
    )
    # build summary index
    summary_index = SummaryIndex.from_documents(
        documents[title], service_context=service_context
    )
    # define query engines
    vector_query_engine = vector_index.as_query_engine()
    list_query_engine = summary_index.as_query_engine()

    # define tools
    # The agent chooses a tool from its description, so each description must
    # match the engine behind it: the vector index serves targeted context
    # lookups, the summary index serves summarization questions.
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description=(
                    f"Useful for retrieving specific context from {title}"
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=list_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for summarization questions related to"
                    f" {title}"
                ),
            ),
        ),
    ]

    # build agent
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
    )

    agents[title] = agent

In [None]:
# Define the top-level nodes: one IndexNode per document. Each node's text
# describes the document and its index_id is the document title, which is the
# same key used for that document's agent in `agents`.
nodes = []
for title in titles:
    # The summary must describe the content that is actually indexed (the
    # documents loaded from data/), because this text is what the top-level
    # vector retriever matches queries against.
    title_summary = (
        f"This content contains the document {title}. Use"
        " this index if you need to lookup specific facts about"
        f" {title}.\nDo not use this index if you want to analyze"
        " multiple documents."
    )
    node = IndexNode(text=title_summary, index_id=title)
    nodes.append(node)

In [None]:
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.retrievers import RecursiveRetriever

# Top-level retriever: index the per-document IndexNodes and return only the
# single best-matching node for a query.
vector_index = VectorStoreIndex(nodes)
vector_retriever = vector_index.as_retriever(similarity_top_k=1)

# Recursive retriever rooted at the "vector" retriever.
# note: can pass `agents` dict as `query_engine_dict` since every agent can be used as a query engine
recursive_retriever = RecursiveRetriever(
    root_id="vector",
    retriever_dict={"vector": vector_retriever},
    query_engine_dict=agents,
    verbose=True,
)

In [None]:
# Build the response synthesizer in "compact" mode.
# NOTE(review): service_context is deliberately not passed to the synthesizer
# here (it was commented out); it is only passed to from_args below -- confirm
# that this is intended.
response_synthesizer = get_response_synthesizer(response_mode="compact")

# Assemble the final query engine on top of the recursive retriever.
query_engine = RetrieverQueryEngine.from_args(
    retriever=recursive_retriever,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
)

In [None]:
# Run a sample question through the recursive-retrieval query engine and
# print the synthesized answer.
question = "What can we liken to C-3PO and Chewbacca?"
response = query_engine.query(question)
print(response)