## Load PDF files and split them into chunks

In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

In [2]:
# Point a directory reader at the sample PDF folder, then load its contents.
# The recorded run reports the result as "pages", i.e. one document per PDF page.
pdf_reader = SimpleDirectoryReader("../local-model/sample-pdf")
documents = pdf_reader.load_data()

In [3]:
# documents

In [4]:
from llama_index.core.node_parser import SentenceSplitter

In [5]:
# Break the loaded documents into overlapping chunks ("nodes") for embedding.
# NOTE(review): chunk_size / chunk_overlap are presumably token counts, per the
# SentenceSplitter docs — confirm against the installed version.
text_splitter = SentenceSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
nodes = text_splitter.get_nodes_from_documents(documents=documents)

In [6]:
# nodes

In [7]:
# Report how many documents were loaded and how many chunks the splitter produced.
for count, label in ((len(documents), 'pages'), (len(nodes), 'nodes')):
    print(count, label)

5 pages
18 nodes


## Create Chroma as vector db and load BAAI/bge-small-en-v1.5 as the embedding model

In [8]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb

In [9]:
# Create an in-memory Chroma client and open the collection that will hold the
# node embeddings. get_or_create_collection is used instead of create_collection
# so this cell can be re-run in the same kernel: create_collection raises once a
# collection named "percobaan" already exists.
# NOTE(review): on a re-run the existing collection is reused together with any
# vectors already stored in it — restart the kernel for a clean store.
chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.get_or_create_collection("percobaan")

# Hugging Face embedding model; it is handed to VectorStoreIndex when the
# index is built.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# Wrap the Chroma collection as a LlamaIndex vector store and expose it through
# a storage context so the index writes its vectors into Chroma.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [10]:
# Disabled alternative: build the index directly from the loaded documents
# instead of from the pre-split nodes.
# NOTE(review): `ollama_embedding` is not defined in the visible cells — define
# it (or substitute `embed_model`) before re-enabling this code.
# index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context, embed_model=ollama_embedding
# )

In [11]:
# Build the vector index from the pre-split nodes, using the Chroma-backed
# storage context and the Hugging Face embedding model configured earlier.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)

## Create retriever and load Llama3-8B as LLM

In [12]:
# Obtain a retriever from the index. No arguments are passed, so the library's
# default retrieval settings apply.
retriever = index.as_retriever()

In [13]:
# Retrieval-only sanity check (no LLM involved): the returned list of scored
# nodes is displayed as the cell's output.
retriever.retrieve("What's the website that provide NTU Academic Integrity Guidelines?")

[NodeWithScore(node=TextNode(id_='0e72dd7a-aab1-4744-acbf-d136930b33e1', embedding=None, metadata={'page_label': '2', 'file_name': '12_hl2090.pdf', 'file_path': 'c:\\Users\\lebnovo\\llamaindex\\..\\local-model\\sample-pdf\\12_hl2090.pdf', 'file_type': 'application/pdf', 'file_size': 211936, 'creation_date': '2024-07-07', 'last_modified_date': '2024-07-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9106a0cd-d8c3-49ce-a8b9-d6f87d39d9e3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '2', 'file_name': '12_hl2090.pdf', 'file_path': 'c:\\Users\\lebnovo\\llamaindex\\..\\local-model\\sample-pdf\\12_hl2090.pdf', 'file_type': 'application/pdf', 'file_size': 211936, 'creation_date': '2024-07-07

In [14]:
from llama_index.llms.ollama import Ollama

In [15]:
# LLM served through Ollama, with a 60-second request timeout.
# NOTE(review): presumably this needs a running Ollama server with the
# "llama3:latest" model already pulled — confirm before running.
llm = Ollama(
    model="llama3:latest",
    request_timeout=60.0,
)

# Query engine = retrieval over the index + answer synthesis by the LLM.
query_engine = index.as_query_engine(llm=llm)

# Smoke test with the same question used for the retrieval check.
response = query_engine.query(
    "What's the website that provide NTU Academic Integrity Guidelines?"
)
print(response)

http://www.ntu.edu.sg/ai/Pages/academic-integrity-policy.aspx


## Modify the prompt template

In [16]:
# Inspect the prompt templates the query engine currently uses; the dict is
# displayed as the cell's output.
prompts_dict = query_engine.get_prompts()
prompts_dict

{'response_synthesizer:text_qa_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '), conditionals=[(<function is_chat_model at 0x00000264190800E0>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessage(role=<MessageR

In [17]:
from llama_index.core import PromptTemplate

# Custom question-answering prompt. The text is spelled out piece by piece so
# the whitespace that reaches the model is visible: it starts with a newline,
# every non-blank line carries four leading spaces, the first sentence line ends
# with a trailing space, and the text ends with four spaces.
# {context_str} and {query_str} are the two placeholders left in the template.
template = (
    "\n"
    "    Answer the question based on the context below. If you can't \n"
    "    answer the question, reply \"I don't know\".\n"
    "\n"
    "    Context: {context_str}\n"
    "\n"
    "    Question: {query_str}\n"
    "    "
)
qa_template = PromptTemplate(template)

In [18]:
# Replace the response synthesizer's text-QA prompt with the custom template.
prompt_overrides = {"response_synthesizer:text_qa_template": qa_template}
query_engine.update_prompts(prompt_overrides)

In [19]:
# Fetch the prompts again to verify that the text-QA template was replaced;
# the dict is displayed as the cell's output.
prompts_dict = query_engine.get_prompts()
prompts_dict

{'response_synthesizer:text_qa_template': PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='\n    Answer the question based on the context below. If you can\'t \n    answer the question, reply "I don\'t know".\n\n    Context: {context_str}\n\n    Question: {query_str}\n    '),
 'response_synthesizer:refine_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.REFINE: 'refine'>}, template_vars=['query_str', 'existing_answer', 'context_msg'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.REFINE: 'refine'>}, template_vars=['query_str', 'existing_answer', 'context_msg'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template="The original query is as follows: {query_str}\nWe have 

## Ask the RAG system

In [20]:
# Ask the earlier question again, this time through the customised prompt.
response = query_engine.query(
    "What's the website that provide NTU Academic Integrity Guidelines?"
)
print(response)

The answer is: http://www.ntu.edu.sg/ai/Pages/academic-integrity-policy.aspx


In [21]:
# Sample questions to run through the RAG pipeline.
questions = [
    "What is the main focus of the course HL 2090 Special Topic in Literature, I: Literature and Economics?",
    "What's the website that provide NTU Academic Integrity Guidelines?",
    "What if you wish to use the materials for your assignments?",
    "Can you list the books mentioned in the course?",
    "Can you provide the English Language requirement?",
    "What is the minimum acceptable score for the TOEFL iBT test?",
    "How long is the validity of the MUET score?",
]

# Run one query per question and print each question/answer pair, separated
# by a blank line.
for question in questions:
    print(f"Question: {question}")
    response = query_engine.query(question)
    print(f"Answer: {response}", end="\n\n")

Question: What is the main focus of the course HL 2090 Special Topic in Literature, I: Literature and Economics?
Answer: The main focus of the course HL 2090 Special Topic in Literature, I: Literature and Economics is an introduction to economic concepts and ideas through their dramatization in literature.

Question: What's the website that provide NTU Academic Integrity Guidelines?
Answer: http://www.ntu.edu.sg/ai/Pages/academic-integrity-policy.aspx

Question: What if you wish to use the materials for your assignments?
Answer: If you wish to use the materials for your assignments, you must cite them accordingly.

Question: Can you list the books mentioned in the course?
Answer: Based on the context provided, here is the list of books mentioned in the course:

1. "The Way to Wealth" by Benjamin Franklin
2. "Bartleby the Scrivener" by Herman Melville
3. "The Great Gatsby" by F. Scott Fitzgerald
4. "Crazy Rich Asians" by Kevin Kwan
5. "Manhattan Transfer" by John Dos Passos
6. "McTeague