<a href="https://colab.research.google.com/github/chawbel/rag_project_enhanced/blob/main/Copy_of_rag_pipeline_enhanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading the document

In [None]:
!pip install llama-index pymupdf langchain
!pip install faiss-cpu
!pip install rank-bm25
!pip install nltk
import nltk
nltk.download('punkt_tab')
!pip install langchain-community
!pip install bitsandbytes accelerate

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the PDF through LlamaIndex's directory reader.
pdf_reader = SimpleDirectoryReader(input_files=["human-nutrition-text.pdf"])
document = pdf_reader.load_data()


In [None]:
from llama_index.core import Document
import fitz

def extract_text_with_ocr(pdf_path):
  """Return the text of every page of a PDF as one string.

  Despite the name, no OCR step is performed here: each page is read with
  ``page.get_text("text")``.

  Args:
    pdf_path: Path of the PDF file to open.

  Returns:
    The text of all pages in page order, each page followed by a newline.
  """
  # Open the document in a context manager so the file handle is released
  # even if extraction fails part-way through.
  with fitz.open(pdf_path) as doc:
    # Join once at the end instead of growing a string page by page.
    return "".join(page.get_text("text") + "\n" for page in doc)

# Extract the whole book as one string and wrap it in a single Document.
full_text = extract_text_with_ocr("human-nutrition-text.pdf")

combined_doc = Document(
    text=full_text,
    metadata={"source": "human-nutrition-text.pdf"},
)

## Preprocessing and chunking

### Chunking the text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,#target size of each chunk
    chunk_overlap=0,#no overlap: consecutive chunks share no text
    separators = [". "]#the only separator offered is ". " (period + space)
)

# split the full document text into chunks
chunks = text_splitter.split_text(combined_doc.text)

### Convert chunks into LlamaIndex documents

In [None]:
# Wrap every chunk in a Document that records its origin, position and size.
chunked_docs = [
    Document(
        text=chunk,
        metadata={
            "source": "human-nutrition-text.pdf",
            "chunk_id": i,
            "char_count": len(chunk),
            # rough estimate: one token per four characters
            "token_count": len(chunk) / 4,
        },
    )
    for i, chunk in enumerate(chunks)
]

print(len(chunked_docs))

971


### Cleaning each chunk

In [None]:
import re

def clean_chunk(text):
  """Normalise a chunk of extracted PDF text for indexing.

  Drops lines that contain nothing but digits (stand-alone page numbers) and
  collapses every run of whitespace, newlines included, into a single space.
  Numbers inside running text ("Table 3.1", "90 mg") are kept.

  Args:
    text: Raw chunk text.

  Returns:
    The cleaned text with no leading or trailing whitespace.
  """
  #remove page numbers: only digits that sit alone on their own line
  text = re.sub(r"(?m)^[^\S\n]*\d+[^\S\n]*$", "", text)
  #collapse newlines and repeated spaces into single spaces
  return re.sub(r"\s+", " ", text).strip()

#clean each chunk
cleaned_chunked_docs = []
for doc in chunked_docs:
  cleaned_text = clean_chunk(doc.text)
  cleaned_doc = Document(
      text = cleaned_text,
      # Build a fresh metadata dict and refresh the size fields: the values
      # carried by `doc.metadata` describe the text before cleaning.
      metadata = {
          **doc.metadata,
          "char_count":len(cleaned_text),
          "token_count":len(cleaned_text)/4
      }
  )
  cleaned_chunked_docs.append(cleaned_doc)


### Generating the embeddings

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Encode all chunks in one call so the model can batch them, instead of
# invoking it once per chunk. list() keeps `embeddings` a list with one
# vector per chunk, in chunk order.
embeddings = list(
    embedding_model.encode([doc.text for doc in cleaned_chunked_docs])
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Storing the embeddings in a vector database (FAISS)

In [None]:
import faiss
import numpy as np

# FAISS requires float32, so build the embedding matrix with that dtype
embeddings_array = np.asarray(embeddings, dtype="float32")

# flat L2 index sized to the width of one embedding vector
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array)



### Building the BM25 index
BM25 is a keyword-based search algorithm.

In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# BM25 is built from token lists: lower-case each chunk, then tokenise it
lowercased_texts = (doc.text.lower() for doc in cleaned_chunked_docs)
tokenized_chunks = [word_tokenize(text) for text in lowercased_texts]
bm25 = BM25Okapi(tokenized_chunks)

def bm25_search(query, top_k=5):
  """Return the indices of the chunks that BM25 scores highest for a query.

  Args:
    query: Free-text query; it is lower-cased and tokenised with
      ``word_tokenize`` before scoring.
    top_k: Maximum number of chunk indices to return.

  Returns:
    A list of integer chunk indices, best score first. Empty when ``top_k``
    is zero or negative.
  """
  # A non-positive top_k must yield nothing: the slice ``[-0:]`` would
  # otherwise select every chunk.
  if top_k <= 0:
    return []
  tokenized_query = word_tokenize(query.lower())
  scores = bm25.get_scores(tokenized_query)
  # argsort is ascending, so take the last top_k entries and reverse them
  top_indices = scores.argsort()[-top_k:][::-1]
  return top_indices.tolist()

### Creating the FAISS retriever class

In [None]:
from langchain.schema.retriever import BaseRetriever
from langchain.schema.document import Document
import faiss
import numpy as np
from pydantic import Field
from typing import List

# Create FAISS retriever
class FAISSRetriever(BaseRetriever):
  """LangChain retriever that answers queries from a FAISS index.

  The query is embedded with the module-level ``embedding_model`` and the
  nearest stored vectors are looked up in ``index``; each hit is mapped back
  to its text through ``chunk_texts``.
  """

  # FAISS index that is searched for the query embedding
  index: faiss.Index
  # chunk texts; position i must correspond to vector i in ``index``
  chunk_texts: List[str]

  def __init__(self, index: faiss.Index, chunk_texts:list):
        """Store the index and the texts it refers to.

        Args:
          index: FAISS index to search.
          chunk_texts: Texts addressed by the positions the index returns.
        """
        # Hand the values to the base-class initialiser instead of relying on
        # class-level defaults bound to notebook globals.
        super().__init__(index=index, chunk_texts=chunk_texts)

  def _get_relevant_documents(self, query:str, top_k:int=5, *, run_manager=None) -> List[Document]:
        """Return the ``top_k`` chunks closest to ``query`` in the index.

        Args:
          query: Text to search for.
          top_k: Number of neighbours requested from FAISS.
          run_manager: Callback manager supplied by LangChain; unused here.

        Returns:
          Documents for the hits, in the order FAISS returned them.
        """
        query_embedding = embedding_model.encode(query).astype('float32').reshape(1, -1) #generate query embedding
        _, indices = self.index.search(query_embedding, top_k) #search FAISS index
        return [ #return documents as list of document objects
            Document(
                page_content = self.chunk_texts[idx],
                metadata = {"source":"human-nutrition-text.pdf"}
            )
            for idx in indices[0]
            # FAISS pads missing results with -1, which would otherwise
            # silently index the last chunk.
            if idx >= 0
        ]

## Hybrid search implementation

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Create BM25 retriever
def _bm25_preprocess(text):
  """Lower-case and tokenise text the same way the standalone BM25 index does."""
  return word_tokenize(text.lower())

chunk_texts = [doc.text for doc in cleaned_chunked_docs]

bm25_retriever = BM25Retriever.from_texts(
    texts=chunk_texts,
    # `from_texts` takes its tokenizer as `preprocess_func`; `tokenizer` is
    # not one of its parameters, so the NLTK tokenizer was never applied.
    preprocess_func=_bm25_preprocess
)
bm25_retriever.k = 5  # Number of BM25 results

#create faiss retriever (inherits from BaseRetriever)
faiss_retriever = FAISSRetriever(index, chunk_texts)

#combine retrievers
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights = [0.6,0.4]
)

#perform hybrid search
query = "role of fibers"
# NOTE(review): EnsembleRetriever.invoke does not appear to forward `top_k`
# to the child retrievers, so each one returns its own default number of
# results - confirm.
hybrid_results = ensemble_retriever.invoke(query,top_k=5)

#extract texts from hybrid results
hybrid_texts = [doc.page_content for doc in hybrid_results]

## Integrating a reranker

In [None]:
from sentence_transformers import CrossEncoder

# cross-encoder that rerank() uses to score (query, chunk) pairs
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, documents, top_k=5):
  """Reorder retrieved documents by cross-encoder relevance to the query.

  Args:
    query: The user question.
    documents: Candidate documents; each must expose ``page_content``.
    top_k: Maximum number of documents to return.

  Returns:
    Up to ``top_k`` documents, highest score first. Empty when there are no
    candidates or ``top_k`` is not positive.
  """
  # Nothing to score, or nothing requested: skip the model call entirely.
  if not documents or top_k <= 0:
    return []

  #create query chunk pairs
  pairs = [[query, doc.page_content] for doc in documents]

  #predict relevance scores
  scores = reranker_model.predict(pairs)

  #sort documents by score, best first
  ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

  #return top_k reranked documents
  return [doc for doc, _ in ranked[:top_k]]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
#perform hybrid search
# Spelling matters here: the BM25 half of the retriever matches exact tokens.
query = "what are the benefits of antioxidants?"
# NOTE(review): EnsembleRetriever.invoke does not appear to forward `top_k`
# to the child retrievers - confirm.
hybrid_results = ensemble_retriever.invoke(query, top_k=10)

#rerank the hybrid results
reranked_results = rerank(query, hybrid_results, top_k=5)

#extract text from reranked results
reranked_texts = [doc.page_content for doc in reranked_results]

print("reranked chunks")
for i, text in enumerate(reranked_texts):
  print(f"chunk {i+1}: {text}")

reranked chunks
chunk 1: . Table . Some Antioxidants Obtained from Diet and Their Related Functions Antioxidant Antioxidant Source Antioxidant Function Vitamin A Karat banana, beef liver, chicken liver Protects cellular membranes, prevents glutathione depletion, maintains free radical detoxifying enzyme systems, reduces inflammation Vitamin E Sunflower seeds, almonds, sunflower oil Protects cellular membranes, prevents glutathione depletion Vitamin C Oranges, grapefruit Protects DNA, RNA, proteins, and lipids, aids in regenerating vitamin E Vitamin D Swordfish, salmon, tuna fish canned in water and drained Regulates blood calcium levels in concert with parathyroid hormone Carotenoids Pumpkin, carrots Free radical scavenger Antioxidants | Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities. These activities are available in the web-based textbook and not available in the download

In [None]:
def construct_context(texts):
  """Format chunk texts as one numbered context string.

  Each text becomes a "chunk <n>:" line (numbered from 1) followed by the
  text on the next line; entries are separated by a blank line and the
  result has no leading or trailing whitespace.
  """
  numbered = (f"chunk {i}:\n {text}" for i, text in enumerate(texts, start=1))
  return "\n\n".join(numbered).strip()

## Generating a response with an LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline


#Load the quantized LLM
model_id = "google/gemma-7b-it"
# tokenizer and model are both loaded from the same checkpoint id
tokenizer = AutoTokenizer.from_pretrained(model_id)
# request 4-bit loading of the model weights
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
# device_map="auto": presumably lets the library choose device placement - confirm
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")



In [None]:
#create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=750,
    temperature=0.7, #control creativity
    do_sample=True,
    # Return only the newly generated text; by default the pipeline output
    # starts with the full prompt, which then shows up in the printed answer.
    return_full_text=False
)

#wrap in LangChain's HuggingFacePipeline
llm  = HuggingFacePipeline(pipeline=pipe)

Device set to use cuda:0


### Define the prompt template

In [None]:
from langchain.prompts import PromptTemplate

# Prompt sent to the LLM: `{context}` receives the numbered chunks and
# `{question}` the user query.
# NOTE(review): the NOTE line lets the model use its own knowledge, which
# conflicts with "based only on the provided context" - confirm the intent.
prompt_template = PromptTemplate(
    template = """
    You are a nutrition expert. Answer the question based ***only*** on the provided context.
    if the context does not contain the answer, respond with "I don't know"
    ***NOTE***:you can use your knowledge to generate some ***nutrition*** related information IF AND ONLY IF the provided context did not contain much relevant info
    Context:
    {context}

    Question: {question}
    """,
    input_variables=["context","question"]
)

In [None]:
#Example query
query = """explain the role of proteins in the body, and how can someone increase their protein intake, cite from which chapter or section you brought the info
     """
# Retrieve, rerank and construct context
# NOTE(review): EnsembleRetriever.invoke does not appear to forward `top_k`
# to the child retrievers - confirm.
hybrid_results = ensemble_retriever.invoke(query, top_k=10)
reranked_results = rerank(query, hybrid_results,top_k=5)
context = construct_context([doc.page_content for doc in reranked_results])

#Format the prompt
prompt = prompt_template.format(context=context, question=query)

#generate the response
# Use `.invoke`, matching the retriever calls above; calling the LLM object
# directly is the deprecated form.
response = llm.invoke(prompt)

#clean and display the response
# Remove the end-of-sequence marker first so whitespace left in front of it
# is stripped as well.
cleaned_response = response.replace("</s>", "").strip()
print(f'Answer:\n{cleaned_response}')

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Answer:
You are a nutrition expert. Answer the question based ***only*** on the provided context.
    if the context does not contain the answer, respond with "I dont't know"
    ***NOTE***:you can use your knowledge to generate some ***nutrition** related information IF AND ONLY IF the provided context did not contain much relevant info 
    Context:
    chunk 1:
 . You can view it online here: https://pressbooks.oer.hawaii.edu/ humannutrition2e22/?p=#h5p- The Role of Proteins in Foods: Cooking and Denaturation | Figure . Digestion and Absorption of Protein Image by Allison Calabrese / CC BY . Protein Digestion and Absorption How do the proteins from foods, denatured or not, get processed into amino acids that cells can use to make new proteins? When you eat food the body’s digestive system breaks down the protein into the individual amino acids, which are absorbed and used by cells to build other proteins and a few other macromolecules, such as DNA. We previously discussed the genera