# Document RAG

In [1]:
# Import all the dependencies
from qdrant_client import models, QdrantClient
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.vectorstores.qdrant import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from peft import PeftModel, PeftConfig
#
from tqdm.auto import tqdm
from uuid import uuid4
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd

#
import os
import random
import torch
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Initialise the Document Store
You will need to customise this section to your specific use case

In [2]:
# Point the loader at the folder holding your PDFs; adjust the location to suit.
# To load a single file instead, use PyPDFLoader("<path to one pdf>").
docs_dir = f"/mnt/data/{os.environ['DOMINO_PROJECT_NAME']}/{os.environ['CUSTOMER_NAME']}"
loader = PyPDFDirectoryLoader(docs_dir)

# Split every document into chunks of up to 1000 characters with no overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
data = loader.load_and_split(splitter)

In [3]:
# Report how many chunks the splitter produced
chunk_count = len(data)
print(f"There are {chunk_count} chunks in the documents")

There are 1975 chunks in the documents


In [4]:
# Pick a sample chunk at random.
# random.randint is inclusive at both ends, so randint(0, len(data)) could
# return len(data) and raise IndexError; randrange excludes the upper bound.
print(data[random.randrange(len(data))])



In [5]:
# Separate each chunk into its metadata and its text, keeping both lists aligned
metadatas = [chunk.metadata for chunk in data]
texts = [chunk.page_content for chunk in data]
print(len(metadatas), len(texts))

1975 1975


In [7]:
# Load the embedding model and cache it in our artifacts directory.
# The cache location is exported before the model object is created.
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/mnt/artifacts/model_cache/'

embedding_model_name = "BAAI/bge-small-en"
model_kwargs = {'device': 'cpu'}                 # run the embedding model on CPU
encode_kwargs = {'normalize_embeddings': True}   # ask for normalised embedding vectors

# Pass the variable rather than repeating the literal, so changing
# embedding_model_name actually changes the model that gets loaded.
embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_name,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs
                                     )

.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 213kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 29.6kB/s]
README.md: 100%|██████████| 90.8k/90.8k [00:00<00:00, 1.35MB/s]
config.json: 100%|██████████| 684/684 [00:00<00:00, 395kB/s]
config_sentence_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 71.8kB/s]
model.safetensors: 100%|██████████| 133M/133M [00:00<00:00, 283MB/s] 
pytorch_model.bin: 100%|██████████| 134M/134M [00:00<00:00, 393MB/s] 
sentence_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 7.82kB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 71.3kB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 2.67MB/s]
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 197kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 59.8MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 199kB/s]


### Initialise the Vector Store
Now we can create the collection in the Qdrant Vector Store database.

**Note: This step takes several minutes!**

In [8]:
# Embed every chunk and write the resulting collection to the artifacts directory
doc_store = Qdrant.from_texts(
    texts,
    embedding=embeddings,
    metadatas=metadatas,
    collection_name="nissan",
    path="/mnt/artifacts/local_qdrant/",
    prefer_grpc=True,
)

## Initialise the Model

Now that we have the Vector Store and the Embedding Model we need to get the Foundation Model that we will be using.
In this case we are leveraging the open-source Llama 2 model, Llama-2-7b-chat-hf. In contrast to third-party services like OpenAI, this open-source model can be downloaded into your own cloud and run entirely within your enterprise's ecosystem, meaning you have tighter control over security and governance.

We will:
1. Set up the prompt for this use case
2. Configure bitsandbytes for the quantisation we need
3. Download, configure and save the Llama2 model

1. Set up the prompt

In [6]:
# Prompt for the QA bot: {context} receives the retrieved text and {question}
# receives the user's question (wrapped in triple backticks inside the template).
prompt_template = """Use the following pieces of context to answer the question enclosed within  3 backticks at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Please provide an answer which is factually correct and based on the information retrieved from the vector store.
Please also mention any quotes supporting the answer if any present in the context supplied within two double quotes "" .

{context}

QUESTION:```{question}```
ANSWER:
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
#


2. Configure bitsandbytes

In [12]:
# Keyword arguments carrying the custom prompt
chain_type_kwargs = {"prompt": PROMPT}

# ---------------------------------------------------------------------------
# bitsandbytes quantisation settings
# ---------------------------------------------------------------------------
use_4bit = True                      # activate 4-bit precision base model loading
bnb_4bit_quant_type = "nf4"          # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"   # name of the torch dtype used for 4-bit compute
use_nested_quant = False             # nested (double) quantization disabled

# Resolve the dtype name to the actual torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.58s/it]


3. Download and configure the model

In [None]:
model_id = "NousResearch/Llama-2-7b-chat-hf"
model_cache_dir = "/mnt/artifacts/model_cache/"

# Load the base model with the bitsandbytes quantisation config; device_map='auto'
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=model_cache_dir,
    quantization_config=bnb_config,
    device_map='auto'
)

# Load LLaMA tokenizer, using the same cache directory as the model.
# NOTE(review): trust_remote_code=True permits repository-supplied code to run;
# confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          cache_dir=model_cache_dir,
                                          trust_remote_code=True)

# Reuse the end-of-sequence token for padding. A separate '[PAD]' token is
# deliberately NOT added: it would extend the tokenizer vocabulary without the
# model's embedding matrix being resized to match, and it was immediately
# overridden by the EOS token anyway.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.pad_token_id = model.config.eos_token_id

## Putting it all together!

Now that we have our Vector Database with our documents in it and our configured model, we can create our RAG QA chain.

In [13]:
# Wrap the loaded model and tokenizer in a text-generation pipeline
# (at most 200 newly generated tokens per call) and expose it to LangChain
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
)
rag_llm = HuggingFacePipeline(pipeline=pipe)

# Retriever over the vector store, configured with k=5
retriever = doc_store.as_retriever(search_kwargs={"k": 5})

# "stuff" chain using our custom prompt; source documents are returned with the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=rag_llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

NameError: name 'doc_store' is not defined

Now we can test our model!

Run the following cell and ask a question based on the documents you have added to the vector store

In [12]:
# Prompt for a question, run it through the QA chain and show the answer text
user_question = input("Please provide your question here :")
result = qa_chain(user_question)
answer = result['result']
print(answer)

Please provide your question here : how do I change the key fob battery


