In [1]:
import os

import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader, Document


In [2]:
# Read the LlamaParse API key from the environment (None when the variable is unset)
# and report which of the two cases applies.
LLAMAPARSE_API_KEY = os.environ.get('LLAMAPARSE_API_KEY')
print(
    'Check for API key in environment variable'
    if LLAMAPARSE_API_KEY is None
    else 'API key found'
)

API key found


In [3]:
# Collect the parser options first, then build the LlamaParse client from them.
parser_settings = dict(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown",  # "text" is the other option
    verbose=True,
    language="en",  # English, which is also the default
    # num_workers=4,  # enable when parsing multiple files
)
parser = LlamaParse(**parser_settings)

In [4]:
# Alternative (left disabled): load the document and parse it directly with the parser
# documents = parser.load_data('../data/axis-press-release-q3fy24.pdf')

In [5]:
def filename_fn(filename):
    """Build the per-file metadata dict: the value handed in by the reader, stored under "file_name".

    NOTE(review): the reader presumably passes the file path here - confirm
    against the SimpleDirectoryReader `file_metadata` contract.
    """
    return {"file_name": filename}


# Map the ".pdf" extension to the LlamaParse parser so the reader uses it for PDFs.
file_extractor = {".pdf": parser}

reader = SimpleDirectoryReader(
    input_files=['../data/axis-press-release-q3fy24.pdf'],
    file_extractor=file_extractor,
    filename_as_id=True,
    file_metadata=filename_fn,
)

# Run the reader; the resulting documents are used by the cells that follow.
documents = reader.load_data()

Started parsing the file under job_id 92789fec-dd48-4fc3-8d54-e7a6244f663d


In [None]:
# Attach the same human-readable description to the metadata of every loaded document.
for parsed_doc in documents:
    parsed_doc.metadata.update(
        file_descr='Axis bank quarterly earnings report for quarter ended December 2023',
    )

In [6]:
# document = Document(
#     documents,
#     metadata={"filename": "axis-press-release-q3fy24",
#               "category":"press release",
#               "quarter":"q3",
#               "financial_year":"fy24",
#               },
# )

In [7]:
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser

In [None]:
# OpenAI LLM handle with the sampling temperature pinned to 0.
llm = OpenAI(
    temperature=0,
    model='gpt-3.5-turbo-0125',
)

In [15]:
# Build the markdown element node parser around the LLM, then turn the
# loaded documents into nodes.
node_parser = MarkdownElementNodeParser(llm=llm)
nodes = node_parser.get_nodes_from_documents(documents)

Embeddings have been explicitly disabled. Using MockEmbedding.


5it [00:00, 1957.76it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:07<00:00,  1.45s/it]


In [16]:
# Ask the parser for its (base nodes, objects) pair and unpack it.
nodes_and_objects = node_parser.get_nodes_and_objects(nodes)
base_nodes, objects = nodes_and_objects

In [18]:
# Index the base nodes together with the objects returned by the node parser.
index_nodes = base_nodes + objects
index = VectorStoreIndex(nodes=index_nodes)

# Wrap the index in a query engine using its default settings.
query_engine = index.as_query_engine()

In [19]:
# Send one question to the query engine and print the response it returns.
query = "what is axis bank's RoA"
resp = query_engine.query(query)
print(resp)

1.84%
