# Objective: To use gguf model locally with langchain 

    embedding: HuggingFaceEmbeddings (sentence-transformers/all-mpnet-base-v2; note that the code below loads a local gte-base model)
    
    vector store: Chroma
    
    retriever: from vector store
    
    llm:
        1)mistral_7B_v0.3
        2)capybarahermes-2.5-mistral-7b.Q3_K_L.gguf

## Document Loader
    pdf loader : langchain inbuilt document loader 

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Hard-coded local path to the source PDF; adjust it to run on another machine.
file_path = (r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\3_Text_query_bot\_docs\Leave_Policy_2024.pdf")
loader = PyPDFLoader(file_path)
# load() returns one Document per PDF page. Chunking happens exactly once, in the
# Split step below; load_and_split() would pre-chunk the text with a default
# splitter and the Split step would then re-chunk those chunks.
pages = loader.load()
len(pages)


## Split
    smaller chunks

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 1000 with an overlap of 200 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
# Chunk every loaded document and show how many chunks were produced.
splits = text_splitter.split_documents(pages)
len(splits)


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

## Create vector store
    stores embeddings of Documents

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# Embedding model loaded from a local directory rather than by hub id.
new_embeddings = HuggingFaceEmbeddings(
    model_name=r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models\embedding_models\gte-base",
)
# Embed the chunks and index them in a Chroma vector store (no persist
# directory is passed here).
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=new_embeddings,
)
vectorstore

## Create retriever

In [None]:
# Expose the vector store as a retriever that uses similarity search.
retriever = vectorstore.as_retriever(search_type="similarity")

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

### LLM Used:

## 1)Model: mistral_7B_v0.3
    Langchain + CTransformer
    Size: 4.07 GB 
    RAM usage: how to find??
    Response time:
    
- quantized model with langchain
- on cpu without nvidia gpu
- cpu -- 16 gb RAM
- download the gguf model
- What is quantization --> the higher the bit width of the quant --> the more RAM it needs

In [None]:
from langchain_community.llms import CTransformers
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Folder that holds the downloaded weights, and the quantized file inside it.
model_dir = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models\mistral_7B_v0.3"
model_file = "Mistral-7B-Instruct-v0.3.Q4_K_M.gguf"

# Generation settings handed to the model backend.
config = {"context_length": 16000, "max_new_tokens": 1600}

# Build the LLM; the stdout callback handler is attached for streamed output.
llm = CTransformers(
    model=model_dir,
    model_file=model_file,
    callbacks=[StreamingStdOutCallbackHandler()],
    config=config,
)

## 2)Model:capybarahermes-2.5-mistral-7b.Q3_K_L.gguf

    Langchain + CTransformer
    Size: 3.82 GB 
    RAM usage: 6.02 GB
    Response time:


Model Specs:
- context_length: 32768

def => maximum number of tokens (not words) that the model takes into account when generating a response

- max_new_tokens: 32768 

def => max number of tokens that can be generated. helps to control the length of generated output
    
(+)Note:
- with fewer max tokens, the processing will be faster


In [None]:
from langchain_community.llms import CTransformers
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# PromptTemplate is imported once, from langchain.prompts; the root-level
# `from langchain import PromptTemplate` form is deprecated and bound the same name.
from langchain.prompts import PromptTemplate

# Folder that holds the downloaded weights.
model_dir =  r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models"
# NOTE(review): the section headings name the Q3_K_L file, but this loads the
# Q3_K_M file — confirm which quantization is actually on disk.
model_file = "capybarahermes-2.5-mistral-7b.Q3_K_M.gguf"

# Generation settings handed to the model backend.
config = {'context_length': 16000, 'max_new_tokens': 1600}


# Build the LLM; the stdout callback handler is attached for streamed output.
llm = CTransformers(model= model_dir, model_file = model_file, callbacks=[StreamingStdOutCallbackHandler()], config= config)

## 3) llama-2-7b-chat.Q6_K.gguf
download link: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main
Model specifications

In [None]:
from langchain_community.llms import CTransformers
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Folder that holds the downloaded weights, and the quantized file inside it.
model_dir = r"D:\OneDrive - Adani\Desktop\LEARNING_FOLDER\_Kolkata_2024\1_LLM\local_downloaded_models"
model_file = "llama-2-7b-chat.Q6_K.gguf"

# Generation settings handed to the model backend.
config = {"context_length": 16000, "max_new_tokens": 1600}

# Build the LLM; the stdout callback handler is attached for streamed output.
llm = CTransformers(
    model=model_dir,
    model_file=model_file,
    callbacks=[StreamingStdOutCallbackHandler()],
    config=config,
)

In [None]:
# Quick smoke test: send a raw prompt to whichever `llm` was assigned last.
llm.invoke("You are a better AI assistant you know")

## Creating Prompt

In [None]:
from langchain.prompts import PromptTemplate

### New Terms:
- MultiQueryRetriever: 
    - automates the process of prompt tuning
    - generates multiple queries from different perspectives 
    - for each query, returns relevant documents, then takes the union across all of them
    - overcomes the limitations of distance-based retrieval


In [None]:
from langchain.prompts import ChatPromptTemplate

## Exploring Retrievers

In [None]:
# RAG prompt: instructs the model to answer only from the supplied context.
# {context} and {question} are the two template variables.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

# Wrap the template text in a chat prompt object.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
import logging

# Apply the default logging configuration, then set only the multi-query
# retriever's logger to INFO.
logging.basicConfig()
_multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
_multi_query_logger.setLevel(logging.INFO)

In [None]:
# Rebind `retriever`, this time with the vector store's default retriever settings.
retriever = vectorstore.as_retriever()
def format_docs(docs):
    """Return the ``page_content`` of every document, joined by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

## Memory:
1) ConversationBufferMemory

In [None]:
from langchain.memory import ConversationBufferMemory

In [None]:
# Conversation buffer keyed as "chat_history", configured to return messages.
# NOTE(review): `memory` is not passed to any chain in the visible code; the
# invoke calls below supply `chat_history` explicitly — confirm it is still needed.
memory = ConversationBufferMemory(
    memory_key= "chat_history",
    return_messages= True
)

In [None]:
memory

## Chain

In [None]:
## For Memory LCEL may not work
from langchain.chains.llm import LLMChain
from langchain.chains.conversation.base import ConversationChain
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain

# Conversational retrieval chain built from the LLM and the retriever. It is
# asked to return the source documents and the generated question with each
# answer. No memory object is attached, so callers pass `chat_history` themselves.
conversation = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    return_generated_question=True,
)

# chain = {"context": retriever| format_docs, "question": RunnablePassthrough} | conversation


In [None]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Instruction for the question-rewriting step: turn a follow-up that depends on
# earlier turns into a standalone question, without answering it.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        # Earlier turns are inserted here under the "chat_history" key.
        MessagesPlaceholder("chat_history"),
        # The latest user message is supplied as "input".
        ("human", "{input}"),
    ]
)
# Retriever wrapper built from the LLM, the base retriever and the rewriting prompt.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# System prompt for the answering step; {context} is the slot for the
# retrieved documents. The trailing backslashes join the source lines into one
# paragraph of prompt text.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        # Earlier turns are inserted here under the "chat_history" key.
        MessagesPlaceholder("chat_history"),
        # The latest user message is supplied as "input".
        ("human", "{input}"),
    ]
)


# Chain that passes the documents plus the QA prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full pipeline: history-aware retrieval followed by the QA chain.
# NOTE(review): `rag_chain` is built here, but the cells below invoke
# `conversation` instead — confirm which chain is meant to be used.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
# Start with an empty history and ask the first question.
chat_history = []
conversation.invoke({"question": "Give me summary in 5 bullet points", "chat_history":chat_history })

In [None]:
# Second question; the chain result is kept in `output` for the cells below.
output = conversation.invoke({"question": "Tell me more about PL", "chat_history":chat_history })

In [None]:
# Extract the answer text from the chain output and display it.
answer=output['answer']
answer

In [None]:
# Keep the question from the same output for the history update below.
question = output['question']

In [None]:
from langchain_core.messages import HumanMessage

In [None]:
from langchain_core.messages import AIMessage, HumanMessage

# Pull the answer text and the question out of the last chain result.
answer = output["answer"]
question = output["question"]
# Record the turn as message objects. The answer is wrapped in AIMessage because
# the conversational chain's history formatter accepts messages or
# (human, ai) tuples, not bare strings.
chat_history.extend([HumanMessage(content=question), AIMessage(content=answer)])

" Privilege Leave (PL) is a type of leave that employees earn based on their service. They are entitled to 21 days of PL in a Leave Year 2. Their earning rate for PL is 1.75 days for every month of service rendered. PL can be availed in units of 0.5 day. Employees must compulsorily avail at least 15 days of PL in a block of 2 years to rejuvenate themselves. In the case of employees who have recently joined, the first block of 2 years starts from January when they earn 21 days of leave. Unavailed compulsory PL will lapse in the Leave Year following the year in which compulsory PL was due. Advance PL can be requested through appropriate channels and will be settled against the PL balance that will be credited in the next Leave Year. However, if leave is credited and availed in advance of entitlement or earning, it will be recovered from employees upon leaving the company's service unless they have earned that leave."

In [None]:
(chat_history)

In [None]:
# Follow-up question, sent together with the accumulated `chat_history`.
output = conversation.invoke({"question": "Tell me more about the last question ", "chat_history":chat_history })

In [None]:
# Run the bare retriever on a sample query and inspect what comes back.
retriever_chain = retriever
docs = retriever_chain.invoke("Give me summary in 5 bullet points")
docs
# Total number of characters across the retrieved documents. A generator
# expression keeps the loop variable local, so the `pages` list from the
# loader cell is not overwritten.
# NOTE: despite its name, `total_pages` is a character count, not a page count.
total_pages = sum(len(doc.page_content) for doc in docs)
total_pages
    

In [None]:
# Pipe the retriever into format_docs so the chain returns one joined string.
retriever_chain = retriever | format_docs
docs = retriever_chain.invoke("Give me summary in 5 bullet points")
# `docs` is a string here, so this is its length in characters, not a document count.
len(docs)


In [None]:
# `conversation` is invoked elsewhere in this notebook with a mapping holding
# "question" and "chat_history", and it is built with its own retriever. The raw
# input string is therefore routed to "question" and the current history is
# attached, instead of piping pre-formatted retriever text into the chain.
final_chain = (
    {
        "question": RunnablePassthrough(),
        "chat_history": lambda _: chat_history,
    }
    | conversation
)
final_chain.invoke(input="dfdf")

In [None]:
# LCEL pipeline: build the prompt inputs (formatted retrieved context plus the
# untouched question), fill the prompt, run the LLM, and parse the output to a
# string.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

result = chain.invoke("Give me summary in 5 bullet points only")


In [None]:
result
