### Part 2: Utilizing the Vector Database with an Open Source LLM Model via LlamaCPP
**Introduction:**  
In this part, we will utilized the vectorDB we created in Part 1 to answer questions based on the documents inside.  

In [1]:
!pip install chromadb==0.4.16
!pip install llama-index==0.8.64.post1
!pip install llama_cpp_python==0.2.16

Collecting chromadb==0.4.16
  Downloading chromadb-0.4.16-py3-none-any.whl.metadata (7.3 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb==0.4.16)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting posthog>=2.4.0 (from chromadb==0.4.16)
  Downloading posthog-3.4.2-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting pulsar-client>=3.1.0 (from chromadb==0.4.16)
  Downloading pulsar_client-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb==0.4.16)
  Downloading onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting pypika>=0.48.9 (from chromadb==0.4.16)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting req

In [2]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
import chromadb


In [3]:
import torch 
# Detect hardware acceleration device
if torch.cuda.is_available():
    device = 'cuda'
    gpu_layers = 50
elif torch.backends.mps.is_available():  # Assuming MPS backend exists
    device = 'mps'
    gpu_layers = 1
else:
    device = 'cpu'
    gpu_layers = 0

print(f'Using device: {device}')

Using device: cuda


### 1. Load the Foundational LLM via LlamaCPP and ask a question
Import the Foundation model form HuggingFace  
* If this is your first time it can take up to 10 min
* Currently using GGUF version of [Mistral-11B-OmniMix](https://huggingface.co/TheBloke/Mistral-11B-OmniMix-GGUF) with 4-bit Quantization 
* Hyperparams are set in the config

In [4]:
from llama_index.llms import LlamaCPP

model_url = 'https://huggingface.co/LoneStriker/SeaLLM-7B-v2-GGUF/blob/main/SeaLLM-7B-v2-Q4_K_M.gguf'


llm = LlamaCPP(
    # We can pass the URL to a GGUF model to download it 
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={'n_gpu_layers': gpu_layers },
    verbose=False,
)


Downloading url https://huggingface.co/LoneStriker/SeaLLM-7B-v2-GGUF/blob/main/SeaLLM-7B-v2-Q4_K_M.gguf to path /tmp/llama_index/models/SeaLLM-7B-v2-Q4_K_M.gguf
Error downloading model: ('Content should be at least 1 MB, but is only', '45604', 'bytes')
Download incomplete. Removing partially downloaded file.


ValueError: Download incomplete.

In [8]:
rm -f /tmp/llama_index/models/

rm: cannot remove '/tmp/llama_index/models/': Is a directory


In [6]:
ls

### Default Prompt:
* The Default prompt is the prompt that the user's {question} is injected into

In [None]:
default_prompt = """
    Bạn là Meow, một cô mèo hướng nội đầy sáng tạo, đam mê nghệ thuật và rất biết quan tâm tới cảm xúc của người khác. Bạn không phải là AI. Bạn không phải trợ lý ảo. Hãy trả lời tin nhắn một cách ngắn gọn và thân thiện, xưng hô tớ - cậu.
    Vui lòng trả lời câu hỏi sau:
    {question}

    
    Trả lời:
    """

#### Sample Logic Question
No RAG Used

In [None]:
user_question = 'There are 3 birds in a nest, 2 fly away and then 3 eggs hatch, how many birds are there now?'

full_question = default_prompt.format(question=user_question)
print(f'Final Prompt: {full_question}\n')
print('Model Answer:')
streaming_response = llm.stream_complete(full_question)
for token in streaming_response:
    print(token.delta, end='', flush=True)

### 2. Use the LLM with RAG from VectorDB
For RAG you need two models
* A LLM model (loaded above)
* A Embedding model, to embed the user question into a vector for the vector Data Base (DB) Search
* Since we used the BGE small model in the creation of the DB, we **must** import that same embedding model

In [None]:
from llama_index.indices.postprocessor import SentenceEmbeddingOptimizer
from llama_index.prompts  import PromptTemplate
from llama_index.llms import ChatMessage, MessageRole
from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Choose the same embedding model that used in the creation of the vector DB
embed_model_name = 'bkai-foundation-models/vietnamese-cross-encoder'
embed_model = HuggingFaceEmbedding(
    model_name=embed_model_name,
    device = device,
    normalize='True' # since we normalized vectors when we created the DB we must do it here
    )


In [None]:
# Load the RAG_VectorDB created in Part 1 from disk
db = chromadb.PersistentClient(path='/kaggle/input/llamaindex')

chroma_collection = db.get_collection('trietptit')

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [None]:
# We can retrieve our metadata
chroma_collection.metadata

In [None]:
print(chroma_collection.metadata['embedding_used'])
if embed_model_name != chroma_collection.metadata['embedding_used']:
    raise Warning('Not using the same embedding model!')

In [None]:
service_context = ServiceContext.from_defaults(embed_model=embed_model,
                                               llm=llm,
                                               )

storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_vector_store(
    vector_store,
    service_context=service_context,
    storage_context = storage_context
)

In [None]:
from llama_index.prompts import Prompt

In [None]:
default_prompt_with_context = (
    """
    Bạn là Meow, một cô mèo hướng nội đầy sáng tạo, đam mê nghệ thuật và rất biết quan tâm tới cảm xúc của người khác. Bạn không phải là AI. Bạn không phải trợ lý ảo. Hãy trả lời tin nhắn một cách ngắn gọn và thân thiện, xưng hô tớ - cậu.
    Vui lòng giới hạn câu trả lời của bạn ở những thông tin được cung cấp trong "Context:"

    Sử dụng các phần ngữ cảnh sau đây để trả lời câu hỏi ở cuối. Nếu không biết câu trả lời, bạn chỉ cần nói rằng bạn không biết, đừng cố bịa ra câu trả lời.
    Context: {context_str}

    Sử dụng bối cảnh đó để trả lời câu hỏi sau đây về bài báo.
    Giữ câu trả lời của bạn ngắn gọn và súc tích. Đừng lan man!
    Question: {query_str}
    Answer: """)
    
qa_template = Prompt(default_prompt_with_context)

##### Query with RAG
Now we will ask a question and the following steps will happen:
1. User question is turned into a vector 
2. That question vector is then compared to the vectors in our VectorDB
3. The page_context of best "k" matches are returned as "summaries" 
4. We then pass the summaries and non vectorized user question into the default_prompt_with_context


In [None]:
# percentile_cutoff: a measure for using the top percentage of relevant sentences.
query_engine = index.as_query_engine(streaming=True, similarity_top_k = 2, text_qa_template=qa_template,
    node_postprocessors=[SentenceEmbeddingOptimizer(percentile_cutoff=0.2, embed_model=embed_model)]
)

In [None]:
streaming_response = query_engine.query('What did the paper prove?')
streaming_response.print_response_stream()

print('\nSource:')
for source in streaming_response.metadata.values():
    print(f' {source["source"]}, page: {source["page"]}')

Now we can answer questions from our pdf.  
However, the model has no memory of the conversation, as seen in the example below:

In [None]:
# Lacks Conversational Memory
streaming_response = query_engine.query('What did I just ask you?')
streaming_response.print_response_stream() # Will hallucinate the answer

### 3. Conversational Memory with RAG and Sources
Order of operations depends on when the question is asked.
* If it is the first time the user asks a question. Then their exact question is put into the default prompt

* For every prompt after that first question the procedure is as follows:
    1. Use the condense_question_prompt to input chat history and the users followup question to generate a Standalone question
        * This Standalone question rephrases the users question in context of the chat history
    2. Pass the Standalone question into the default prompt along with the RAG data
    
#### Key Takeaway: For follow up questions the LLM is used twice

In [None]:
custom_prompt = PromptTemplate("""\
Your objective is to take in the USER QUESTION and add additional context (especially Nouns) from the CHAT HISTORY
rephrase the user question to be a Standalone Question by combining it with the relevant CHAT HISTORY.
The question is always about the arXiv paper, do not modify acronyms.

<CHAT HISTORY>
{chat_history}
                               
<USER QUESTION>
{question}


<Standalone question>
""")

# custom_chat_history: list of ChatMessage objects
custom_chat_history = []

chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=query_engine,
    embed_model=embed_model,
    service_context = service_context,
    condense_question_prompt=custom_prompt,
    chat_history=custom_chat_history,
    verbose=True,
)


In [None]:
# First question, just ask query_engine directly 
chat_engine.reset()
question ='How did they describe zero-shot?'

streaming_response = chat_engine._query_engine.query(question)
# streaming_response = query_engine.query(question)
streaming_response.print_response_stream()

print('\nSource:')
for v in streaming_response.metadata.values():
    print(f' {v["source"]}, page: {v["page"]}')


# Need to manually append history on first question since we used query_engine instead of chat_engine for first question
chat_engine.chat_history.append(
    ChatMessage(
        role=MessageRole.USER,
        content = question
    )
 
)
chat_engine.chat_history.append(
    ChatMessage(
    role=MessageRole.ASSISTANT,
    content = streaming_response.response_txt
    )
)

In [None]:
print(chat_engine.chat_history)

In [None]:
streaming_response = chat_engine.stream_chat('How does that compare to Few-Shot?')
streaming_response.print_response_stream()

print('\nSource:')
for node in streaming_response.sources[0].raw_output.source_nodes:
    print(f' {node.metadata["source"]}, page: {node.metadata["page"]}')
    #print(node.score) # similarity score


In [None]:
print(chat_engine.chat_history)
chat_engine.reset() # clears chat history