In [14]:
import openai
from dotenv import load_dotenv
import os
# from Ipython.display import Markdown, display

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [15]:
from llama_index.llms import OpenAI

from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
    ServiceContext,
    StorageContext,
    load_index_from_storage
)

from llama_index.text_splitter import SentenceSplitter
from llama_index import SimpleDirectoryReader
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor


## 1. Semantic Search

#### Loading

In [2]:
documents = SimpleDirectoryReader("./src_docs/current_subset").load_data()

#### Indexing and Storing:

Once stored, there is no need to index them again as this will result in openai api calls to converting documents into embeddings.

In [30]:
#high level api
# service_context = ServiceContext.from_defaults(chunk_size=512, chunk_overlap=10)
# index = VectorStoreIndex.from_documents(
#     documents,
#     service_context=service_context
# )


# [ or ]

#low level api: customize the parser
# parser = SentenceSplitter(chunk_size=512, chunk_overlap=10)
# service_context = ServiceContext.from_defaults(text_splitter=parser)

# index = VectorStoreIndex.from_documents(
#               documents, 
#               service_context=service_context, 
#               show_progress=True
#)

# index.set_index_id("research_papers_vector_index")
# index.storage_context.persist("./storage")


# [ or ]

#further low level api: customize the parser
# parser = SentenceSplitter(chunk_size=512, chunk_overlap=10)
# nodes = parser.get_nodes_from_documents(documents)
# index = VectorStoreIdex(nodes)
# service_context = ServiceContext.from_defaults(text_splitter=parser)
# index.set_index_id("research_papers_vector_index")
# index.storage_context.persist("./storage")

#### Reload index from local storage

In [34]:
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(
    storage_context, 
    index_id="research_papers_vector_index"
)

In [35]:
#configure a Retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

In [36]:
#configure response synthesis
response_synthesizer = get_response_synthesizer()

In [37]:
#assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

In [38]:
response = query_engine.query("Can you give me an overview of graph of thoughts in an easy to understand manner?")

In [41]:
print(response.response)

The Graph of Thoughts (GoT) is a framework that enhances the capabilities of large language models (LLMs) by representing the information generated by an LLM as a graph. In this graph, each unit of information, known as an "LLM thought," is represented as a vertex, and the relationships between these thoughts are represented as edges.

GoT allows for the combination of different LLM thoughts, enabling the LLM to solve complex problems by leveraging the connections between thoughts. It supports various thought transformations, including aggregation, refining, and generation.

Aggregation involves combining multiple thoughts into a new thought, allowing for the synthesis of whole networks of thoughts. Refining allows for the modification of a current thought, while generation involves creating new thoughts based on existing ones.

Thoughts in GoT can be scored and ranked to evaluate their quality and relevance. This helps in assessing the effectiveness of the current solution and selecti

In [42]:
tot_query_engine = index.as_query_engine(response_mode="tree_summarize")

In [43]:
response = tot_query_engine.query(
    "What are the key concepts involved in understanding graph of thoughts. Can you summarize and explain each of them shortly ?"
)

In [44]:
print(response.response)

The key concepts involved in understanding the Graph of Thoughts (GoT) are as follows:

1. Graph of Thoughts (GoT): GoT is an approach that enhances the capabilities of Language Model Machines (LLMs) through networked reasoning. It models LLM thoughts as vertices and dependencies between thoughts as edges in a directed graph. GoT allows for the aggregation of arbitrary thoughts by constructing vertices with multiple incoming edges, enabling more complex thought patterns.

2. Thought Transformations: GoT enables various thought transformations, such as aggregating thoughts into a new one, refining thoughts through looping, and backtracking from a chain of thoughts. These transformations enhance the flexibility and power of LLMs in generating solutions.

3. Evaluator Function (E) and Ranking Function (R): In the GoT framework, an evaluator function (E) is used to assign scores to thoughts, evaluating their relevance or quality. A ranking function (R) is then employed to select the most r

- maximum marginal relevance
- custom embedding string

In [47]:
response.get_formatted_sources()

'> Source (Doc id: f7b7f7b3-2285-4c36-aa96-5635a53d194f): For example, one could explore a certain chain of reason-\ning, backtrack and start a new one, the...\n\n> Source (Doc id: b42f9f15-951b-4e28-b1aa-d367287e811b): Input\nOutputInput\nOutput OutputThoughts:\nUnscored\nNegative\nscore OutputInput\nOutput[This work]\nIn...'

#### query with filters

In [13]:
# from llama_index.vector_stores.types import ExactMatchFiler, MetadataFilters

In [None]:
filters = MetadataFilters(
    filters=[
        ExactMatchFiler(key="tag")
    ]
)

retriever = index.as_retriever(
    similarity_top_k=20,
    filters=filters,
)

---

## 2. Metadata Extraction

Disambiguate similar looking passages by extracting important keywords from the passage.

In [11]:
from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor
)
from llama_index.ingestion import IngestionPipeline

In [7]:
transformations = [
    SentenceSplitter(),
    TitleExtractor(nodes=5),
    QuestionsAnsweredExtractor(questions=3),
    SummaryExtractor(summaries=["prev", "self"]),
    KeywordExtractor(keywords=10),
    EntityExtractor(prediction_threshold=0.5)
]
pileline = IngestionPipeline(transformations=transformations)

  from .autonotebook import tqdm as notebook_tqdm
config.json: 100%|██████████| 5.08k/5.08k [00:00<00:00, 897kB/s]
model.safetensors: 100%|██████████| 712M/712M [00:11<00:00, 63.4MB/s] 
config.json: 100%|██████████| 625/625 [00:00<00:00, 89.0kB/s]
tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 5.90kB/s]
vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 20.1MB/s]
tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 16.7MB/s]


In [10]:
# this will make many llm calls

# nodes = pileline.run(documents=documents)

#### custom extractor

In [9]:
from llama_index.extractors import BaseExtractor

In [None]:
# In a more advanced example, it can also make use of an llm to extract features from the node content and the existing metadata. Refer to the source code of the provided metadata extractors for more details.

class CustomExtractor(BaseExtractor):
    def extract(self, nodes):
        metadata_list = [
            {
                "custom":node.metadata["document_title"]
                + "\n"
                + node.metadata["excerpt_keywords"]
            } for node in nodes
        ]
        return metadata_list 

-----

## 3. Document Management

In [8]:
from llama_index import SummaryIndex, Document

In [12]:
index = SummaryIndex([])
text_chunks = ["The cat's name is Cash", "I love quantum mechnics", "I want to buy a tesla"]
doc_chunks = []
for i, text in enumerate(text_chunks):
    doc = Document(text=text, id=f"doc_id_{i}")
    doc_chunks.append(doc)
#insert
for doc_chunk in doc_chunks:
    index.insert(doc_chunk)

In [16]:
#delete
index.delete_ref_doc("The cat's name is Cash", delete_from_docstore=True)

In [17]:
doc_chunks[0].text

"The cat's name is Cash"

In [18]:
#update

# Here, we passed some extra kwargs to ensure the document is deleted from the docstore. This is of course optional.
doc_chunks[0].text = "Who is Fred Flinstones?"
index.update_ref_doc(
    doc_chunks[0],
    update_kwargs={"delete_kwargs":{"delete_from_docstore":True}}
)

In [19]:
#refresh: (use case: constantly updating new information such as a directory, the working context memory, etc)

# modify first document, with the same doc_id
doc_chunks[0] = Document(text="Andrew Huberman podcast is amazing!", id_="doc_id_0")

#add a new document
doc_chunks.append(
    Document(
        text="this isn't in the index yet, but it will soon be",
        id_="doc_id_2"
    )
)

refreshed_docs = index.refresh_ref_docs(
    doc_chunks, 
    update_kwargs={"delete_kwargs": {"delete_from_docstpre": True}}
)
refreshed_docs

[True, False, False, True]

In [20]:
index.ref_doc_info

{'cd602e96-f474-415e-8b6f-f5a4e0de2ee2': RefDocInfo(node_ids=['4b76d51f-02e5-4a20-8321-8707781392dd'], metadata={}),
 'cdd3620c-63f3-415a-b478-96cb47cd2404': RefDocInfo(node_ids=['3cbc1dd9-4310-4b39-bcc1-cff65261a6e2'], metadata={}),
 '9f28c6bb-dbf6-4d7c-8360-f2ee43442af2': RefDocInfo(node_ids=['0934edfa-4f20-4351-a174-ea47f6ba5f71'], metadata={}),
 'doc_id_0': RefDocInfo(node_ids=['f2b83001-4551-4b3b-8263-e357a99a4bb7'], metadata={}),
 'doc_id_2': RefDocInfo(node_ids=['2a42784c-5f21-4d95-ad64-486fa328085a'], metadata={})}

----

## 4. Customizing Storage

In [21]:
index = VectorStoreIndex.from_documents(documents)

In [22]:
index.ref_doc_info

{'9ca2a1d0-3497-481b-bce2-e039f68cffd2': RefDocInfo(node_ids=['488f81f3-4839-4375-94cb-83bf87e180ad', '76cbc7da-ebd5-4562-84a4-921c8e3aa2b4'], metadata={'page_label': '1', 'file_name': 'graph_of_thoughts.pdf', 'file_path': 'src_docs/current_subset/graph_of_thoughts.pdf', 'file_type': 'application/pdf', 'file_size': 2838941, 'creation_date': '2023-11-25', 'last_modified_date': '2023-11-25', 'last_accessed_date': '2023-11-25'}),
 'ef5bc24e-11ab-42d6-b1c5-4db1ff1239cc': RefDocInfo(node_ids=['7a7849cc-1a77-4ec9-9c80-c44ebe80ed77', '17f91e8d-5fcb-407b-95f9-c4146192f5ec'], metadata={'page_label': '2', 'file_name': 'graph_of_thoughts.pdf', 'file_path': 'src_docs/current_subset/graph_of_thoughts.pdf', 'file_type': 'application/pdf', 'file_size': 2838941, 'creation_date': '2023-11-25', 'last_modified_date': '2023-11-25', 'last_accessed_date': '2023-11-25'}),
 '79786b26-060e-4428-85c2-d57843c47428': RefDocInfo(node_ids=['e171e276-5a7c-45d7-adb0-20d7dbb0c9c9', '03ee9972-173b-491b-80f6-5341d812c17

In [7]:
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore
from llama_index.node_parser import SentenceSplitter

In [27]:
#create a parser and parse the documents into nodes
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(documents)

#create a storage context using default stores
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore(),
    vector_store=SimpleVectorStore(),
    index_store=SimpleIndexStore()
)

#create or load docstore and add nodes
storage_context.docstore.add_documents(nodes)

#build index -comment out after creating index
index = VectorStoreIndex(nodes, storage_context=storage_context)

#set index_id and save index -comment out after creating index
index.set_index_id("graph_of_thoughts")
index.storage_context.persist(persist_dir="./research_paper_index")

#load index from memory
from llama_index import load_index_from_storage
storage_context = StorageContext.from_defaults(persist_dir="./research_paper_index")
index = load_index_from_storage(storage_context)

# # if loading an index from a persist_dir containing multiple indexes
# loaded_index = load_index_from_storage(storage_context, index_id="<index_id>")

# # if loading multiple indexes from a persist dir
# loaded_indicies = load_index_from_storage(
#     storage_context, index_ids=["<index_id>", ...]
# )

In [6]:
from llama_index import (
    load_index_from_storage,
    load_indices_from_storage,
    load_graph_from_storage
)
from llama_index import load_index_from_storage

# Index

In [17]:
#load index 
#[changed default__vector_store.json to vector_store.json in ./research_paper_index]

persist_dir = "./research_paper_index"
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore.from_persist_dir(persist_dir),
    vector_store=SimpleVectorStore.from_persist_dir(persist_dir),
    index_store=SimpleIndexStore.from_persist_dir(persist_dir),
)

index = load_index_from_storage(storage_context, index_id="graph_of_thoughts")

can also load and save index from cloud/remote storaage

- vector stores contain embedding vectors of ingested document chunks (and sometimes the document chunks themselves)
    - chromadb, pinecone, wewiate
- document store contain ingested document chunks (i.e node objects - with embedding, metadata, other data etc.)
    - mongodb, redis
- index stores contain lightweight index metadata (additional state information about index)
    - mongodb, redis
- key-value stores are just key value stores. (not ready yet)
- graph stores store graph data
    - neo4j

-----

## 5. Querying

#### Query Engine

In [30]:
query_engine = index.as_query_engine()
#query_engine.query("")

In [31]:
#high level api
query_engine = index.as_query_engine(
    response_nodes = "tree_summarize",
    verbose=True
)

In [4]:
from llama_index import get_response_synthesizer
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine

In [None]:
#build index
# index = VectorStoreIndex.from_documents(documents)

#configure a retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2
)

#configure a response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize"
)

#assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

#### custom query engine

In [2]:
from llama_index.query_engine import CustomQueryEngine
from llama_index.retrievers import BaseRetriever
from llama_index.response_synthesizers import BaseSynthesizer

In [None]:
class RAGQueryEngine(CustomQueryEngine):
    retriever: BaseRetriever
    response_synthesizer: BaseSynthesizer

    def custom_query(self, query_str: str):
        nodes = self.retriever.retrieve(query_str)
        response_obj = self.response_synthesizer.synthesize(query_str, nodes)
        return response_obj

- all query engines need to be learnt

------

#### Response modes

- refine: separate llm calls for each node to refine answer from each node
    - ans_1 = node_1 + text_qa_template
    - ans_2 = ans_1 + node_2 + refine_template
    - .
    - .
    - ans_n = ans_n-1 + node_n + refine_template


- compact: concatenate as many nodes as context window allows with the text_qa_template or refine_template, then work like refine
- tree_summarize: summary_template + concatenated_node_chunks + T
    - where T = next_layer(summary_template + concatenated_node_chunks + T)


- simple_summarize: quick and lossy
- no_text: dont send data to llm. just select data snd show source with response.source_nodes.
- accumulate: run llm separately on text chunks and then concatinate the answers.
- compact_accumulate: compact + accumulate


-----

### Streaming the output response of the app: 
<b>for reducing the perceived latency</b>

- Chosen llm needs to support streaming.

In [32]:
#high level api
query_engine = index.as_query_engine(streaming=True, similarity_top_k=1)

In [None]:
#low level api

synth = get_response_synthesizer(streaming=True)
query_engine = RetrieverQueryEngine(response_synthesizer=synth)

In [None]:
# streaming_response = query_engine.query("")
for text in streaming_response.response_gen:
    #do something with text as they arrive
    pass

In [None]:
# alternatively print the final text
streaming_response.print_response_stream()

------

### Chat Engine

Keeps track of conversations unlike a query engine

In [33]:
chat_engine = index.as_chat_engine()

# response = chat_engine.chat("ask something to it")

# streaming_response = chat_engine.stream_chat("tell something")
# for token in streaming_response.resopnse_gen:
#     print(token, end="")

In [None]:
chat_engine.reset() # to reset the chat history

In [1]:
# chat_engine.chat_repl() #to enter interactive chat

In [18]:
#high level api
chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)

**chat_mode argument in the above function:**
- best: react data agent or openai data agent
- condense_question: user chat history = query for index
- context: retrieve nodes for every user msg. Retrieved text is inserted into system prompt
- condense_plus_context: condense + context
- simple: no query engine involved.
- react: same as best but a react data agent
- openai: same as best but a openai data agent

In [None]:
#low level composition



-----