## Installing Libraries


In [None]:
!pip install llama-index llama-parse python-dotenv

In [None]:
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
import os
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display
from google.colab import userdata


# Patch asyncio so an event loop can be re-entered. Presumably required because
# the notebook kernel already runs a loop while the parsing/indexing libraries
# drive their own async calls — TODO confirm.
nest_asyncio.apply()

In [None]:
# Read both API keys from Colab's `userdata` store and publish them as
# environment variables for the rest of the notebook. Each key is fetched and
# stored before the next one is looked up.
os.environ.update({"llamaparse_api": userdata.get("LLAMA_CLOUD_API_KEY")})
os.environ.update({"OPENAI_API_KEY": userdata.get("OPENAI_APIKEY")})

## Setting up LlamaParse

In [None]:
# LlamaParse client used as the PDF extractor further down.
# The key is read from the `llamaparse_api` environment entry populated in the
# credentials cell; there is no Python variable called `llamaparse_api`, so
# passing that bare name would raise NameError on a fresh kernel.
parser = LlamaParse(
    api_key=os.environ["llamaparse_api"],
    result_type="markdown"  # "markdown" and "text" are available
)

## Using SimpleDirectoryReader with and without LlamaParse

In [None]:
# Map the .pdf extension to the LlamaParse parser and hand that mapping to the
# directory reader, then load the single textbook PDF through it.
file_extractor = {".pdf": parser}
llamaparse_reader = SimpleDirectoryReader(
    input_files=['/content/data/cellbiology.pdf'],
    file_extractor=file_extractor,
)
documents1 = llamaparse_reader.load_data()
print(documents1)

Started parsing the file under job_id 16d9ef9e-7e79-4210-bc9d-d153c8e3fa60
...............[Document(id_='b5fd9c1b-75ca-4294-82a0-af86c07b4e81', embedding=None, metadata={'file_path': '/content/data/cellbiology.pdf', 'file_name': 'cellbiology.pdf', 'file_type': 'application/pdf', 'file_size': 11438144, 'creation_date': '2024-05-27', 'last_modified_date': '2024-05-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='CELL BIOLOGY\n         SECOND EDITION\n---\n# CELL BIOLOGY\n\nA Short Course\n\nSECOND EDITION\n\nStephen R. Bolsover\n\nDepartment of Physiology\n\nUniversity College London\n\nJeremy S. Hyams\n\nDepartment of Biology\n\nUniversity College London\n\nElizabeth A. Shephard\n\nDepartment of Biochemistry and Molecular Biology\n\nUniversity Colleg

In [None]:
# Baseline load of the same PDF: no file_extractor is supplied, so the reader
# falls back to whatever PDF handling it uses on its own.
baseline_reader = SimpleDirectoryReader(
    input_files=['/content/data/cellbiology.pdf'],
)
documents2 = baseline_reader.load_data()
print(documents2)

[Document(id_='937a36df-22e6-45e0-ae56-1b7d6d2fb46f', embedding=None, metadata={'page_label': '1', 'file_name': 'cellbiology.pdf', 'file_path': '/content/data/cellbiology.pdf', 'file_type': 'application/pdf', 'file_size': 11438144, 'creation_date': '2024-05-27', 'last_modified_date': '2024-05-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='P1: GDZ\nWY001-Bolsover-FM WY001-Bolsover-v3.cls October 22, 2003 14:59\nCELL BIOLOGY\nSECOND EDITION\ni', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='5ed8779d-1998-43a8-bd2e-c53f3428defe', embedding=None, metadata={'page_label': '2', 'file_name': 'cellbiology.pdf', 'file_path': '/content/data/cell

## Index Creation and Query Engine Creation

In [None]:
# Build a vector index over `documents1` (the documents loaded with the
# LlamaParse extractor) and wrap it in a query engine with default settings.
# NOTE(review): presumably embeddings/LLM calls use the OPENAI_API_KEY set
# earlier — confirm.
index1 = VectorStoreIndex.from_documents(documents1)
query_engine1 = index1.as_query_engine()

In [None]:
# Build a vector index over `documents2` (the documents loaded without a custom
# extractor) and wrap it in a query engine with default settings, mirroring the
# setup used for `index1` so the two can be compared.
index2 = VectorStoreIndex.from_documents(documents2)
query_engine2 = index2.as_query_engine()

## Running the Query

In [None]:
# Put the question to the engine built on `index1` and render the answer in bold.
query1 = "What is the equation for the overall reaction catalysed by the electron transport chain?"
response = query_engine1.query(query1)
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

<b>The overall reaction catalyzed by the electron transport chain can be summarized as:

NADH + Q + 5H+ → NAD+ + QH2 + 4H+ matrix intermem</b>

In [None]:
# Put the same question to the engine built on `index2` and render the answer in bold.
query2 = "What is the equation for the overall reaction catalysed by the electron transport chain?"
response = query_engine2.query(query2)
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

<b>NADH + H+ + 1/2O2 → NAD+ + H2O</b>