# Advanced RAG: Extracting Complex PDFs containing tables & Text Using LlamaParse

Reference article: https://aksdesai1998.medium.com/advanced-rag-extracting-complex-pdfs-containing-tables-text-using-llamaparse-48b61693da58

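I'll use [e23076_uber-ars.pdf](https://www.sec.gov/Archives/edgar/data/1543151/000155278123000195/e23076_uber-ars.pdf) as the sample document. No cell in this notebook downloads it, so save the PDF into the `data/` folder (created in the setup section below) before running the loading cells.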

## Setup environment

In [1]:
from google.colab import userdata
import os

# Copy each Colab secret into the process environment under the same name.
# The OpenAI and LlamaParse clients created later are constructed without
# explicit keys, so they presumably read them from these variables.
for secret_name in ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [2]:
!mkdir data

## Q&A with LangChain

In [None]:
!pip install -q langchain-openai langchain-community lancedb "unstructured[pdf]"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.8/34.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from langchain import hub
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.vectorstores import LanceDB
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Load documents: read everything found under /content/data.
loader = DirectoryLoader(path="/content/data")
documents = loader.load()

# Show how many documents the loader produced.
print(len(documents))

1


In [None]:
# Split and chunk: cut the loaded documents into pieces of size 800 that
# overlap their neighbours by 100.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
texts: list = splitter.split_documents(documents)

# Show how many chunks were produced.
print(len(texts))

10820


In [None]:
# Create Vector Store: embed every chunk with OpenAI embeddings and store the
# result in LanceDB. The retriever itself is derived from it in the next cell.
embedding_model = OpenAIEmbeddings()
vectorstore = LanceDB.from_documents(
    documents=texts,
    embedding=embedding_model,
)



In [None]:
# RAG chain


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: Documents returned by the retriever; only their
            ``page_content`` attribute is used.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Fetch the 3 most similar chunks for each question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

prompt = hub.pull("rlm/rag-prompt")

llm = ChatOpenAI(model="gpt-4o")

chain = (
    {
        # The raw question string is forwarded unchanged.
        "question": RunnablePassthrough(),
        # Give the prompt plain text rather than the repr of a list of
        # Document objects (which would also include their metadata).
        "context": retriever | format_docs,
    }
    | prompt
    | llm
    | StrOutputParser()
)

chain.invoke("how is the Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information?")



"I don't know. The provided context does not contain information about the cash paid for income taxes, net of refunds from supplemental disclosures of cash flow information."

## Q&A on PDF Data Using LlamaIndex

In [17]:
!pip install -q llama-index \
                llama-index-postprocessor-flag-embedding-reranker \
                llama-index-vector-stores-lancedb \
                FlagEmbedding

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for FlagEmbedding (setup.py) ... [?25l[?25hdone
[0m

In [36]:
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex
)
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker
)
from llama_index.vector_stores.lancedb import LanceDBVectorStore

In [44]:
# Read everything under data/ into LlamaIndex documents.
reader = SimpleDirectoryReader("data/")
documents = reader.load_data()
print(f"{len(documents)} documents")

# Chunk the documents into nodes (chunk_size=800, chunk_overlap=100).
node_parser = SimpleNodeParser.from_defaults(chunk_size=800, chunk_overlap=100)
nodes = node_parser.get_nodes_from_documents(documents)
print(f"{len(nodes)} nodes")

# Back the index with a LanceDB vector store located at /content/lancedb.
vector_store = LanceDBVectorStore(uri="/content/lancedb")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embed the nodes with OpenAI embeddings and index them, showing a progress bar.
lance_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=OpenAIEmbedding(),
    show_progress=True,
)

153 documents
974 nodes


Generating embeddings:   0%|          | 0/974 [00:00<?, ?it/s]

In [45]:
# Retrieve 10 candidate nodes by similarity, then pass them through a
# reranker configured with top_n=5 before the answer is generated.
reranker = FlagEmbeddingReranker(top_n=5)
query_engine = lance_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=10,
)

In [46]:
%%time

response = query_engine.query("how much is the Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information?")
response.response

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return forward_call(*args, **kwargs)


CPU times: user 27.2 s, sys: 244 ms, total: 27.5 s
Wall time: 28.2 s


'The Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information is not provided in the context information.'

## LlamaParse with LlamaIndex

In [42]:
from llama_parse import LlamaParse
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex
)
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.lancedb import LanceDBVectorStore

In [52]:
# Parse the PDF with LlamaParse, requesting markdown output.
llama_parser = LlamaParse(result_type="markdown")
documents = llama_parser.load_data("/content/data/e23076_uber-ars.pdf")

# Split the parsed documents into nodes using the parser's default settings.
# NOTE(review): the input is markdown; a markdown-aware node parser might keep
# tables intact better than this generic one — worth confirming.
node_parser = SimpleNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)

# Keep this index in its own LanceDB location, separate from /content/lancedb.
vector_store_lance_llamaparse = LanceDBVectorStore(uri="/content/lance_llamaparse")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store_lance_llamaparse,
)

# Embed the nodes with OpenAI embeddings and index them.
lance_llamaparse_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=OpenAIEmbedding(),
)

Started parsing the file under job_id b1509d2f-2e2c-44e4-ba06-54a85742d5aa
.



In [59]:
# Query the LlamaParse-backed index; no reranker is configured here and the
# engine retrieves with similarity_top_k=15.
query_engine = lance_llamaparse_index.as_query_engine(similarity_top_k=15)
response = query_engine.query(
    "how is the Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information?"
)
response.response

'The Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information is $1 billion for the year ended December 31, 2022.'

In [60]:
# Same question, narrowed to a specific date; reuses the query engine created
# in the previous cell.
response = query_engine.query(
    "how is the Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information in March 31, 2022?"
)
response.response

'Cash paid for Income taxes, net of refunds from Supplemental disclosures of cash flow information in March 31, 2022 was $5 million.'

The responses are wrong! :(