<a href="https://colab.research.google.com/github/dhirajsuvarna/advance-rag-examples/blob/main/llama_index_advance_rag_with_llama_parse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Advanced RAG using LlamaParse

In [1]:
! pip install -q llama-index
! pip install -q llama-index-core
! pip install -q llama-index-embeddings-openai
! pip install -q llama-index-postprocessor-flag-embedding-reranker
! pip install -q git+https://github.com/FlagOpen/FlagEmbedding.git
! pip install -q llama-parse

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10q/uber_10q_march_2022.pdf' -O './uber_10q_march_2022.pdf'

--2024-03-26 12:22:30--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10q/uber_10q_march_2022.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2024-03-26 12:22:30 ERROR 404: Not Found.



In [5]:
import os

import nest_asyncio
from google.colab import userdata

# NOTE(review): presumably required so async calls made by later cells can run
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Copy each Colab secret into the environment variable that will hold it.
for env_name, secret_name in (
    ("OPENAI_API_KEY", "HCL_OPENAI_KEY"),
    ("LLAMA_CLOUD_API_KEY", "LLAMA_CLOUD_API_KEY"),
):
    os.environ[env_name] = userdata.get(secret_name)

In [6]:
# Register the embedding model and the LLM on the global Settings object.

from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Each model is also kept under its own name for direct use in the notebook.
Settings.embed_model = embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = llm = OpenAI(model="gpt-3.5-turbo-0125")

# Use LlamaParse for PDF Reading

In [None]:
# Parse the downloaded PDF with LlamaParse, requesting markdown output.
from llama_parse import LlamaParse

pdf_parser = LlamaParse(result_type="markdown")
documents = pdf_parser.load_data("./uber_10q_march_2022.pdf")

Started parsing the file under job_id 43d49d2b-5651-4e95-ac47-8104d1813443
.......................................

In [None]:
import pickle

# Persist the parsed documents to disk as a pickle file.
with open("llama_parse_document.pkl", "wb") as pickle_file:
    pickle.dump(documents, pickle_file)

In [None]:
# Report how many documents the parser returned.
document_count = len(documents)
print(f"Number of Documents: {document_count}")

In [None]:
# Preview the first 1000 characters of the first parsed document.
# (Fixed the misspelled variable name, which raised NameError.)
print(documents[0].text[:1000] + '...')

In [None]:
# Build the markdown element node parser used to generate nodes.

from llama_index.core.node_parser import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser(
    num_workers=8,
    llm=OpenAI(model="gpt-3.5-turbo-0125"),
)

In [None]:
# Run the node parser over the parsed documents.
nodes = node_parser.get_nodes_from_documents(
    documents
)

In [None]:
# Separate the parsed nodes into base nodes and objects.
# get_nodes_from_documents takes documents and returns a single list of nodes,
# so its result cannot be unpacked into two names; get_nodes_and_objects is the
# parser method that performs this split on already-parsed nodes.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [None]:
# Take the first entry of `objects` and display its type as the cell output.
obj = objects[0]
type(obj)

In [None]:
from llama_index.core import VectorStoreIndex

# Index built over the base nodes together with the objects.
recursive_nodes = base_nodes + objects
recursive_retriever_index = VectorStoreIndex(nodes=recursive_nodes)

# Index built directly from the parsed documents.
raw_index = VectorStoreIndex.from_documents(documents)

In [None]:
# Reranker shared by both query engines; configured with top_n=5.
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

reranker = FlagEmbeddingReranker(
    model="BAAI/bge-reranker-large",
    top_n=5,
)

In [None]:
# Query engine over the node+object index: similarity_top_k=15, with the
# reranker applied as a node postprocessor.
recursive_query_engine = recursive_retriever_index.as_query_engine(
    verbose=True,
    node_postprocessors=[reranker],
    similarity_top_k=15,
)

In [None]:
# Query engine over the document-level index, using the same retrieval depth
# and reranker as the recursive engine.
raw_query_engine = raw_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
)

In [None]:
# Ask both query engines the same question and print each answer under its
# own banner so the results can be compared.

query = "how is cash paid for income taxes, net of refund from Supplemental disclousers of cash flow information?"

# Document-level index first.
response_1 = raw_query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********", response_1, sep="\n")

# Then the node+object index.
response_2 = recursive_query_engine.query(query)
print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********", response_2, sep="\n")
