In [1]:
# Import all the dependencies
from qdrant_client import models, QdrantClient
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.vectorstores.qdrant import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from peft import PeftModel, PeftConfig
#
from tqdm.auto import tqdm
from uuid import uuid4
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd

#
import os
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the source documents. Adjust the folder below to wherever your PDFs live.
#
# This loads every PDF in the project's RAG_Docs folder. To load a single file
# instead, build the loader with PyPDFLoader(<path to the pdf>) and keep the
# rest of the cell unchanged.
pdf_dir = f"/mnt/data/{os.environ['DOMINO_PROJECT_NAME']}/RAG_Docs/"

# Chunks of up to 1000 characters with no overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

loader = PyPDFDirectoryLoader(pdf_dir)
data = loader.load_and_split(splitter)

In [6]:
# Report how many entries the loader/splitter produced.
num_entries = len(data)
print(f"There are {num_entries} pages in the document")

There are 52 pages in the document


In [7]:
# Pick a sample page
# random.randint(a, b) includes BOTH endpoints, so randint(0, len(data)) can
# return len(data) and index one past the end of the list (IndexError).
# random.choice always picks a valid element.
print(random.choice(data))

page_content='smoothinglengthoverlapswithanexistingclumpandiftheparticle\nis bound to the clump. If so and if 𝜌 > 𝜌member=10−13g cm−3,\nthenitisaddedtotheclumpandtheclumppropertiesareupdatedto\naccount for the new member. If not and 𝜌 > 𝜌lead, then it becomes\ntheleadmemberofanewclump.Whenaparticle’ssmoothinglength\noverlaps with two clumps, or if two clumps overlap, we determine\nif they are bound, and if so, we merge the clumps. These values\nare chosen to help the clump-finding algorithm distinguish clumps\nfrom the spiral arm they reside in. The impact of our choices do not\nmeaningfully affect the analysis of the clumps.\nSinceweareevaluatingclumpmembershipbyinspectingparticles\ninorderofdecreasingdensityratherthanspatialproximitytoaclump,\nwerepeatthisprocessiterativelyuntilthenumberofclumpsandtheir\nproperties have converged. This is required since each iteration will\nfindmoredistantparticleswhosesmoothinglengthwilloverlapwith\nthe clump. Only dust particles have been considere

In [8]:
# Separate every loaded entry into its metadata and its text. The two lists
# stay index-aligned: metadatas[i] describes texts[i].
metadatas = [chunk.metadata for chunk in data]
texts = [chunk.page_content for chunk in data]
print(len(metadatas), len(texts))

52 52


In [9]:
# Prompt for the QA bot.
# {context} is filled with the retrieved text and {question} with the user's
# question; the question is wrapped in triple backticks so the instructions
# can refer to it unambiguously.
prompt_template = """Use the following pieces of context to answer the question enclosed within  3 backticks at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Please provide an answer which is factually correct and based on the information retrieved from the vector store.
Please also mention any quotes supporting the answer if any present in the context supplied within two double quotes "" .

{context}

QUESTION:```{question}```
ANSWER:
"""
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
#


In [10]:
# Load the embedding model
# Single source of truth for the model id: it is used both to load the model
# here and to label the vector-store collection later, so the two cannot drift.
embedding_model_name = "BAAI/bge-small-en"
model_kwargs = {'device': 'cpu'}                 # run the embedder on CPU
encode_kwargs = {'normalize_embeddings': True}   # forwarded to the encode call

# Keep downloaded sentence-transformers weights inside the project data folder.
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/mnt/data/' + os.environ['DOMINO_PROJECT_NAME'] + '/model_cache/'

embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_name,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs
                                     )

.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 208kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 35.2kB/s]
README.md: 100%|██████████| 90.8k/90.8k [00:00<00:00, 36.7MB/s]
config.json: 100%|██████████| 684/684 [00:00<00:00, 325kB/s]
config_sentence_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 72.1kB/s]
model.safetensors: 100%|██████████| 133M/133M [00:00<00:00, 202MB/s]  
pytorch_model.bin: 100%|██████████| 134M/134M [00:00<00:00, 229MB/s] 
sentence_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 7.67kB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 135kB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 5.21MB/s]
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 259kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.65MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 189kB/s]


In [11]:
# Embed the texts and store them in an in-memory Qdrant collection
# (nothing is persisted; the store is rebuilt on every run).
#
# The keyword is `collection_name`; `collection=` is not a parameter of
# Qdrant.from_texts, so the intended name was never applied to the collection.
# The model id contains "/", which is replaced so the name is a plain
# identifier (it would otherwise act as a path separator if this store is
# later switched to on-disk storage).
collection_name = f"{embedding_model_name}_arxiv_paper".replace("/", "_")
doc_store = Qdrant.from_texts(texts,
                              metadatas=metadatas,
                              embedding=embeddings,
                              location=":memory:",
                              collection_name=collection_name)

In [None]:
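# Alternative: persist the embeddings to disk (uncomment to use instead of the in-memory store).
# NOTE: Qdrant.from_texts expects the keyword `collection_name=`, not `collection=`.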
# doc_store = Qdrant.from_texts(texts,
#                               metadatas=metadatas,
#                               embedding=embeddings,
#                               path="/mnt/artifacts/local_qdrant/",
#                               prefer_grpc=True,
#                               collection=f"{embedding_model_name}_arxiv_paper")

In [None]:
# Load the model and the tokenizer
chain_type_kwargs = {"prompt": PROMPT}


################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Resolve the dtype name to the actual torch dtype object (e.g. torch.float16)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# 4-bit quantization settings used when loading the base model
# (quantized inference only; no LoRA adapter is attached here).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

model_id = "NousResearch/Llama-2-7b-chat-hf"

# One cache location for both the weights and the tokenizer files, so that
# everything is stored inside the project data folder.
model_cache_dir = "/mnt/data/" + os.environ['DOMINO_PROJECT_NAME'] + "/model_cache/"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=model_cache_dir,
    quantization_config=bnb_config,
    device_map='auto'
)

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=model_cache_dir,
    trust_remote_code=True
)
# Reuse the end-of-sequence token for padding. No new '[PAD]' token is added:
# it would be overwritten by the assignment below anyway, and adding it would
# grow the tokenizer vocabulary without resizing the model's embedding matrix.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.pad_token_id = model.config.eos_token_id

config.json: 100%|██████████| 583/583 [00:00<00:00, 264kB/s]
model.safetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 11.7MB/s]
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]
model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s][A
model-00001-of-00002.safetensors:   0%|          | 31.5M/9.98G [00:00<00:41, 238MB/s][A
model-00001-of-00002.safetensors:   1%|          | 62.9M/9.98G [00:00<00:42, 234MB/s][A
model-00001-of-00002.safetensors:   1%|          | 94.4M/9.98G [00:00<00:42, 231MB/s][A
model-00001-of-00002.safetensors:   1%|▏         | 126M/9.98G [00:00<00:43, 229MB/s] [A
model-00001-of-00002.safetensors:   2%|▏         | 157M/9.98G [00:00<00:42, 229MB/s][A
model-00001-of-00002.safetensors:   2%|▏         | 189M/9.98G [00:00<00:43, 227MB/s][A
model-00001-of-00002.safetensors:   2%|▏         | 220M/9.98G [00:00<00:42, 229MB/s][A
model-00001-of-00002.safetensors:   3%|▎         | 252M/9.98G [00:01<00:42, 230MB/s][A
model-000

In [None]:
# Build the question-answering chain.

# Text-generation pipeline around the loaded model; each answer is capped at
# 100 newly generated tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
)
rag_llm = HuggingFacePipeline(pipeline=pipe)

# Retrieve the 5 closest entries from the vector store for every question.
retriever = doc_store.as_retriever(search_kwargs={"k": 5})

# "stuff" chain: the retrieved documents are inserted into the prompt's
# {context} slot; the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=rag_llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

In [None]:
# Read a question from the user, run it through the QA chain and show the
# generated answer as the cell output.
user_question = input("Please provide your question here :")
result = qa_chain(user_question)
result["result"]

# Example question: What is the role of gravitational and drag forces in the concentration of large dust grains?