<a href="https://colab.research.google.com/github/falconflightX/LLM-Cookbook/blob/main/RAG_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing necessary libraries
We use huggingface models, experiment with langchain and llama2

In [1]:
## code to auto login to hugging face, avoid the login prompt

!pip install -U huggingface-hub

# get your account token from https://huggingface.co/settings/tokens
token = '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'

from huggingface_hub import login
login(token=token, add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#### Installing and importing necessary packages

In [2]:
#!pip install -U pip
%pip install torch
%pip install transformers
%pip install langchain
%pip install chromadb
%pip install pypdf
%pip install xformers
%pip install sentence_transformers
%pip install InstructorEmbedding
%pip install pdf2image
%pip install pycryptodome
%pip install cython cchardet
%pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  ## Quantization package
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U einops
!pip install -q -U ctransformers[cuda]


import torch
from auto_gptq import AutoGPTQForCausalLM   # We prefer this over the bits&bytes quantization: https://huggingface.co/blog/overview-quantization-transformers#:~:text=What%20are%20the%20benefits%20of%20autoGPTQ%3F,models%20up%20to%202%20bits!
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from transformers import AutoTokenizer, TextStreamer, pipeline
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

Collecting xformers
  Using cached xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl (213.0 MB)
Collecting torch==2.1.2 (from xformers)
  Using cached torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.2->xformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.1.2->xformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.1.2->xformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.1.2->xformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.1.2->xformers)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.wh

In [3]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
loader = PyPDFDirectoryLoader("/content/documents/")
docs = loader.load()
len(docs)

3318

#### Embedding model

In [5]:
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large", model_kwargs={"device": DEVICE}    ## Try: 'sentence-transformers/all-MiniLM-L6-v2'
)

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer


  return self.fget.__get__(instance, owner)()


max_seq_length  512


In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)  ## Try building custom text splitters
texts = text_splitter.split_documents(docs)
len(texts)

6147

  ###### We use chromadb for storing the embeddings

In [7]:
vectordb = Chroma.from_documents(texts, embeddings, persist_directory="db")

#### LLM input

In [8]:
import torch
import transformers

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)

model.eval()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
   

In [9]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

#### Feeding into the langchain pipeline
###### We first prepare the pipeline

In [10]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    temperature=0.0,
    max_new_tokens=256
)

In [11]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [12]:
DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, please don't share false information.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
[INST] <<SYS>>
{system_prompt}
<</SYS>>

{prompt} [/INST]
""".strip()

In [13]:
SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."

template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

In [14]:
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [15]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [29]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)


In [30]:
qa_with_sources_chain({"query" : "what is the normal maintenance schedule for Hyundai i20?"})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'what is the normal maintenance schedule for Hyundai i20?',
 'result': ' The normal maintenance schedule for Hyundai i20 is not specified in the provided context. However, it is mentioned that the maintenance schedule for Hyundai i20 is similar to the maintenance schedule for other Hyundai models. The maintenance schedule includes items such as engine oil and filter changes, drive belt inspections, fuel filter replacement, and tire pressure adjustments. The frequency of these maintenance tasks depends on the number of months or kilometers driven, as well as the operating conditions of the vehicle. It is recommended to consult an authorized Hyundai dealer for specific details on the maintenance schedule for your Hyundai i20.',
 'source_documents': [Document(page_content='9-16Maintenance\nEngine oil and filter\nThe engine oil and filter should be \nchanged at the intervals specified in the maintenance schedule. If the car is being driven in severe conditions, more frequent oil 

In [18]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

def run_my_rag(qa, query):
    print(f"Query: {query}\n")
    result = qa.run(query)
    print("\nResult: ", result)

### Ask Queries Now
query =""" """
run_my_rag(qa, query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Query:  



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Result:    

I don't know.


#### Try hybrid search

In [23]:
from langchain.retrievers import BM25Retriever,EnsembleRetriever

bm25_retriever = BM25Retriever.from_documents(texts)
bm25_retriever.k=5

In [24]:
vector_retriever = vectordb.as_retriever(search_kwargs={"k":5})
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,vector_retriever],
                                       weights=[0.5,0.5])

In [27]:
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler

handler = StdOutCallbackHandler()

qa_with_sources_chain_hybrid = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ensemble_retriever,
    callbacks=[handler],
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [28]:
qa_with_sources_chain_hybrid({"query" : "what is the normal maintenance schedule for Hyundai i20?"})



[1m> Entering new RetrievalQA chain...[0m


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[1m> Finished chain.[0m


{'query': 'what is the normal maintenance schedule for Hyundai i20?',
 'result': ' The normal maintenance schedule for Hyundai i20 is not specified in the provided context. However, it is mentioned that the maintenance schedule for engine oil replacement is to prevent oil deterioration and it is irrelevant to oil consumption. The recommended engine oil and filter should be used, and if the recommended oil and filter are not used, the maintenance schedule should be followed according to the maintenance schedule under severe usage conditions.',
 'source_documents': [Document(page_content='9-16Maintenance\nEngine oil and filter\nThe engine oil and filter should be \nchanged at the intervals specified in the maintenance schedule. If the car is being driven in severe conditions, more frequent oil and filter changes are required.\nDrive belts\nInspect all drive belts for evidence of cuts, cracks, excessive wear or oil saturation and replace if necessary. Drive belts should be checked periodi

### Try extracting tables