## Loading the document

In [None]:
!pip install llama-index pymupdf langchain
!pip install faiss-cpu
!pip install rank-bm25
!pip install nltk
import nltk
nltk.download('punkt_tab')
!pip install langchain-community
!pip install bitsandbytes accelerate
!pip install uvicorn
!pip install fastapi
!pip install pyngrok

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the PDF through LlamaIndex's directory reader.
# NOTE(review): `document` is not referenced again in the visible notebook;
# the text used downstream comes from the PyMuPDF extraction below — confirm
# whether this cell is still needed.
document = SimpleDirectoryReader(
    input_files = ["human-nutrition-text.pdf"]
).load_data()



In [None]:
from llama_index.core import Document
import fitz

def extract_text_with_ocr(pdf_path):
  """Return the text of every page of a PDF, concatenated in page order.

  Despite the name, no OCR is performed: each page's text is read with
  ``page.get_text("text")``.

  Args:
    pdf_path: Path of the PDF file to open.

  Returns:
    One string in which each page's text is followed by a newline.
  """
  # The context manager closes the document even if reading a page fails;
  # previously the opened document was never closed.
  with fitz.open(pdf_path) as doc:
    page_texts = [
        doc.load_page(page_num).get_text("text")
        for page_num in range(len(doc))
    ]
  # Join once instead of growing a string page by page; the newline after
  # every page (including the last) matches the previous output.
  return "".join(page_text + "\n" for page_text in page_texts)

# Extract the text of the whole PDF as one string.
full_text = extract_text_with_ocr("human-nutrition-text.pdf")

# Wrap the full text in a single Document tagged with its source file name.
combined_doc = Document(text=full_text, metadata={"source":"human-nutrition-text.pdf"})

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): no later cell in the visible notebook reads from or writes to
# the mount point — confirm this cell is needed.
from google.colab import drive
drive.mount('/content/drive')

## Preprocessing and chunking

### Chunking the text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: sentence-boundary splitting with no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,# target size of each chunk
    chunk_overlap=0,# 0 means consecutive chunks share no text
    separators = [". "]# the only separator offered is ". " (end of a sentence)
)

# List of chunk strings produced from the text of the combined document.
chunks = text_splitter.split_text(combined_doc.text)

### Convert chunks into LlamaIndex documents

In [None]:
# Wrap every raw chunk in a Document that records its position and size.
chunked_docs = [
    Document(
        text=chunk_text,
        metadata={
            "source":"human-nutrition-text.pdf",
            "chunk_id":position,
            "char_count":len(chunk_text),
            # estimate computed as characters / 4, so the value is a float
            "token_count":len(chunk_text)/4
        }
    )
    for position, chunk_text in enumerate(chunks)
]

print(len(chunked_docs))

### Cleaning each chunk

In [None]:
import re

def clean_chunk(text):
  """Normalise a chunk of extracted PDF text.

  Lines that consist solely of digits (page numbers standing on their own
  line) are removed. Every run of whitespace, newlines included, is then
  collapsed to a single space and the result is stripped.

  Numbers that appear inside running text are kept. The previous pattern
  removed every standalone number, so "46 grams" became "grams".

  Args:
    text: Raw chunk text.

  Returns:
    The cleaned text as a single line.
  """
  # remove page numbers: only lines made up of digits and nothing else
  text = re.sub(r"(?m)^[ \t]*\d+[ \t]*$", "", text)
  # newlines are whitespace too, so one substitution flattens and collapses
  return re.sub(r"\s+", " ", text).strip()

# Build a parallel list of Documents whose text has been cleaned.
# Each one is given the metadata of the chunk it came from, so the
# "char_count" / "token_count" entries still describe the text before cleaning.
cleaned_chunked_docs = [
    Document(text=clean_chunk(raw_doc.text), metadata=raw_doc.metadata)
    for raw_doc in chunked_docs
]


### Generating the embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the retriever defined later uses the same model
# to embed search queries.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Encode all chunks in one batched call instead of one model call per chunk.
# `encode` returns one row per input text; `list(...)` keeps `embeddings` a
# list of 1-D vectors, the same shape of data the per-chunk loop produced.
embeddings = list(
    embedding_model.encode([doc.text for doc in cleaned_chunked_docs])
)


## Storing the embeddings in a vector database (FAISS)

In [None]:
import faiss
import numpy as np

# Stack the per-chunk vectors into one float32 matrix (FAISS requires float32).
embeddings_array = np.asarray(embeddings, dtype='float32')

# Flat L2 index sized to the width of the embedding matrix.
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array)

# Persist the index to disk so it can be read back with faiss.read_index.
faiss.write_index(index, "faiss_index.index")


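### Building the BM25 index
BM25 is a keyword-based search algorithm. The standalone implementation in the next cell is commented out; the BM25 retriever that is actually used is created in the hybrid-search section below.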

In [None]:
# from rank_bm25 import BM25Okapi
# from nltk.tokenize import word_tokenize

# #preprocess chunks for bm25
# tokenized_chunks = [word_tokenize(doc.text.lower()) for doc in cleaned_chunked_docs]
# bm25 = BM25Okapi(tokenized_chunks)

# def bm25_search(query, top_k=5):
#   tokenized_query = word_tokenize(query.lower())
#   scores = bm25.get_scores(tokenized_query)
#   top_indices = scores.argsort()[-top_k:][::-1]
#   return top_indices.tolist()

### Saving chunk texts and BM25 retriever to a file

In [None]:
# import pickle
# import json

# chunk_texts = [doc.text for doc in cleaned_chunked_docs]
# with open("chunk_texts.pkl", "wb") as f:
#   pickle.dump(chunk_texts, f)

# with open("bm25_retriever.pkl", "wb") as f:
#   pickle.dump(bm25, f)

### Creating the FAISS retriever class

In [None]:
from langchain.schema.retriever import BaseRetriever
from langchain.schema.document import Document
import faiss
import numpy as np
from pydantic import Field
from typing import List

# Create FAISS retriever
class FAISSRetriever(BaseRetriever):
  """LangChain retriever that answers queries from a FAISS index.

  The two attributes are declared as fields of the retriever model. Their
  defaults are placeholders: ``__init__`` always overwrites both with the
  values it receives.
  """
  index: faiss.Index = Field(index)
  chunk_texts: List[str] = Field(cleaned_chunked_docs)

  def __init__(self, index: faiss.Index, chunk_texts:list):
        """Store the FAISS index and the chunk texts looked up by FAISS id."""
        super().__init__()
        self.index = index
        self.chunk_texts = chunk_texts

  def _get_relevant_documents(self, query:str, top_k:int=5) -> list[Document]:
        """Return the chunks whose embeddings are nearest to the query.

        Args:
          query: Search text; embedded with the module-level ``embedding_model``.
          top_k: Number of neighbours requested from FAISS.

        Returns:
          Up to ``top_k`` Documents in the order FAISS returned them.
        """
        query_embedding = embedding_model.encode(query).astype('float32').reshape(1, -1) #generate query embedding
        _, indices = self.index.search(query_embedding, top_k) #search FAISS index
        return [ #return documents as list of document objects
            Document(
                page_content = self.chunk_texts[idx],
                metadata = {"source":"human-nutrition-text.pdf"}
            )
            for idx in indices[0]
            # FAISS fills missing neighbours with id -1 when the index holds
            # fewer than top_k vectors; -1 would otherwise select the LAST chunk.
            if idx >= 0
        ]

## Hybrid search implementation

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
# word_tokenize was previously imported only inside a commented-out cell,
# so this cell raised NameError on a fresh kernel.
from nltk.tokenize import word_tokenize

# Create BM25 retriever
# `preprocess_func` is the tokenisation hook of BM25Retriever.from_texts; the
# previously used `tokenizer=` keyword is not one of its parameters.
# word_tokenize is a module-level function, which keeps the retriever
# picklable for the cell that saves it to disk.
bm25_retriever = BM25Retriever.from_texts(
    texts=[doc.text for doc in cleaned_chunked_docs],
    preprocess_func=word_tokenize
)
bm25_retriever.k = 5  # Number of BM25 results

#create faiss retriver (inherits from BaseRetriever)
faiss_retriever = FAISSRetriever(index, [doc.text for doc in cleaned_chunked_docs])

#combine retrievers (BM25 weighted 0.6, FAISS weighted 0.4)
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights = [0.6,0.4]
)

#perform hybrid search
# NOTE(review): EnsembleRetriever does not appear to forward `top_k` to its
# sub-retrievers, so the per-retriever result counts are the ones configured
# above — confirm against the installed LangChain version.
query = "role of fibers"
hybrid_results = ensemble_retriever.invoke(query,top_k=5)

#extract texts from hybrid results
hybrid_texts = [doc.page_content for doc in hybrid_results]

In [None]:
import pickle  # no earlier live cell imports it

# Persist the artefacts that rag_pipeline.py loads when it starts.
# chunk_texts.pkl holds the chunk strings in the order in which their
# embeddings were added to the FAISS index. It was previously written only by
# a commented-out cell, so rag_pipeline.py could not find it.
chunk_texts = [doc.text for doc in cleaned_chunked_docs]
with open("chunk_texts.pkl", "wb") as f:
  pickle.dump(chunk_texts, f)

with open("bm25_retriever.pkl", "wb") as f:
  pickle.dump(bm25_retriever, f)

## Integrating a reranker

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder that scores (query, chunk text) pairs; used by rerank().
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, documents, top_k=5):
  """Reorder retrieved documents by cross-encoder relevance to the query.

  Args:
    query: The user's question.
    documents: Objects exposing a ``page_content`` string.
    top_k: Maximum number of documents to return.

  Returns:
    Up to ``top_k`` documents, highest score first. An empty input yields
    an empty list.
  """
  # Nothing to score: return early instead of handing the model an empty batch.
  if not documents:
    return []

  # One [query, chunk] pair per candidate, scored by the module-level reranker.
  pairs = [[query, doc.page_content] for doc in documents]
  scores = reranker_model.predict(pairs)

  # Highest score first; the sort is stable, so ties keep retrieval order.
  ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
  return [doc for doc, _ in ranked[:top_k]]

In [None]:
# Retrieve candidates with the hybrid (BM25 + FAISS) retriever.
query = "what are the benefits of antioxydants?"
hybrid_results = ensemble_retriever.invoke(query, top_k=10)

# Keep the five candidates the cross-encoder scores highest, as plain text.
reranked_results = rerank(query, hybrid_results, top_k=5)
reranked_texts = [ranked_doc.page_content for ranked_doc in reranked_results]

print("reranked chunks")
for chunk_number, text in enumerate(reranked_texts, start=1):
  print(f"chunk {chunk_number}: {text}")

In [None]:
def construct_context(texts):
  """Format chunk texts as one numbered context string for the prompt.

  Each text becomes a block headed "chunk <n>:" (numbering starts at 1),
  blocks are separated by a blank line, and leading/trailing whitespace is
  stripped from the final string.
  """
  blocks = [f"chunk {i}:\n {text}\n\n" for i, text in enumerate(texts, start=1)]
  return "".join(blocks).strip()

## Generating a response with an LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline


#Load the quantized LLM
# Checkpoint name on the Hugging Face hub; tokenizer and model both use it.
model_id = "google/gemma-7b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# 4-bit weight loading (needs the bitsandbytes package installed above).
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
# device_map="auto" leaves the placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")



In [None]:
#create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=750, # upper bound on tokens generated per call
    temperature=0.7, #control creativity
    do_sample=True # sampling is on, so answers can differ between runs
)

#wrap in LangChain's HuggingFacePipeline
llm  = HuggingFacePipeline(pipeline=pipe)

### Define the prompt template

In [None]:
from langchain.prompts import PromptTemplate

# Prompt with two placeholders: {context} (the numbered chunks) and {question}.
# Fixed the fallback answer the model is told to give ("I dont't know") and
# the unbalanced emphasis markers around "nutrition".
# NOTE(review): the NOTE line allows outside knowledge while the first line
# says to answer only from the context — confirm which rule should win.
prompt_template = PromptTemplate(
    template = """
    You are a nutrition expert. Answer the question based ***only*** on the provided context.
    if the context does not contain the answer, respond with "I don't know"
    ***NOTE***:you can use your knowledge to generate some ***nutrition*** related information IF AND ONLY IF the provided context did not contain much relevant info
    Context:
    {context}

    Question: {question}
    """,
    input_variables=["context","question"]
)

In [None]:
#Example query
query = """explain the role of proteins in the body, and how can someone increase their protein intake, cite from which chapter or section you brought the info
     """
# Retrieve, rerank and construct context
hybrid_results = ensemble_retriever.invoke(query, top_k=10)
reranked_results = rerank(query, hybrid_results,top_k=5)
context = construct_context([doc.page_content for doc in reranked_results])

#Format the prompt
prompt = prompt_template.format(context=context, question=query)

#generate the response
# Use .invoke(), the same runnable entry point this notebook already uses for
# the retrievers; calling the LLM object directly is the deprecated form.
response = llm.invoke(prompt)

#clean and display the response (drop surrounding whitespace and "</s>" markers)
cleaned_response = response.strip().replace("</s>", "")
print(f'Answer:\n{cleaned_response}')

In [None]:
%%writefile rag_pipeline.py
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import pickle
from langchain.schema.document import Document
from langchain.schema.retriever import BaseRetriever
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from pydantic import Field
from typing import List

# Load components from disk
# All three files are read from the current working directory and must exist
# before this module is imported.
index = faiss.read_index("faiss_index.index")  # FAISS index
# SECURITY: pickle.load can execute arbitrary code while loading; only load
# pickle files that were produced by a trusted source.
with open("chunk_texts.pkl", "rb") as f:
    chunk_texts = pickle.load(f)  # List of cleaned text chunks

# Load BM25 retriever (LangChain format)
with open("bm25_retriever.pkl", "rb") as f:
    bm25_retriever = pickle.load(f)

# Initialize models
# Everything below runs at import time, so importing this module loads all
# three models.
embedding_model = SentenceTransformer("all-mpnet-base-v2")  # Load embedding model
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # scores (query, chunk) pairs

# Initialize LLM
quantization_config = BitsAndBytesConfig(load_in_4bit=True)  # 4-bit weight loading
model_id = "google/gemma-7b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
# Only max_new_tokens is set here; no temperature / do_sample is passed.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=750)
llm = HuggingFacePipeline(pipeline=pipe)

# Create FAISS retriever
class FAISSRetriever(BaseRetriever):
    """LangChain retriever that answers queries from a FAISS index.

    ``index`` and ``chunk_texts`` are declared as model fields; their defaults
    are placeholders that ``__init__`` always overwrites. ``top_k`` is the
    number of neighbours returned when a call does not pass its own value,
    which is the case when the retriever runs inside an EnsembleRetriever.
    """
    index: faiss.Index = Field(index)
    chunk_texts: List[str] = Field(chunk_texts)
    top_k: int = 5

    def __init__(self, index: faiss.Index, chunk_texts: list):
        """Store the FAISS index and the chunk texts looked up by FAISS id."""
        super().__init__()
        self.index = index
        self.chunk_texts = chunk_texts

    def _get_relevant_documents(self, query: str, top_k: int = None) -> list[Document]:
        """Return the chunks whose embeddings are nearest to the query.

        Args:
            query: Search text; embedded with the module-level embedding_model.
            top_k: Number of neighbours to request; ``None`` uses ``self.top_k``.

        Returns:
            Up to ``top_k`` Documents in the order FAISS returned them.
        """
        k = self.top_k if top_k is None else top_k
        query_embedding = embedding_model.encode(query).astype('float32').reshape(1, -1)  # generate query embedding
        _, indices = self.index.search(query_embedding, k)  # search FAISS index
        return [
            Document(
                page_content=self.chunk_texts[idx],
                metadata={"source": "human-nutrition-text.pdf"}
            )
            for idx in indices[0]
            # FAISS fills missing neighbours with id -1 when the index holds
            # fewer than k vectors; -1 would otherwise select the LAST chunk.
            if idx >= 0
        ]


faiss_retriever = FAISSRetriever(index, chunk_texts)

# Combine retrievers
from langchain.retrievers import EnsembleRetriever

ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5]
)


# Hybrid search
def hybrid_search(query, top_k=5):
    """Run BM25 and FAISS retrieval and return the fused result list.

    Args:
        query: Search text.
        top_k: Number of results requested from EACH sub-retriever.

    Returns:
        The documents returned by the ensemble retriever. Because two result
        lists are merged, the list may be longer than ``top_k``.
    """
    # EnsembleRetriever.invoke does not hand extra keyword arguments down to
    # its sub-retrievers, so the requested size is set on each one directly.
    bm25_retriever.k = top_k
    faiss_retriever.top_k = top_k
    return ensemble_retriever.invoke(query)

# Rerank results
def rerank(query, documents, top_k=5):
    """Reorder retrieved documents by cross-encoder relevance to the query.

    Args:
        query: The user's question.
        documents: Objects exposing a ``page_content`` string.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents, highest score first. An empty input yields
        an empty list.
    """
    # Nothing to score: return early instead of handing the model an empty batch.
    if not documents:
        return []

    pairs = [[query, doc.page_content] for doc in documents]
    scores = reranker_model.predict(pairs)

    # Highest score first; the sort is stable, so ties keep retrieval order.
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]

# Construct context
def construct_context(texts):
    """Format chunk texts as one numbered context string for the prompt.

    Each text becomes a block headed "Chunk <n>:" (numbering starts at 1),
    blocks are separated by a blank line, and leading/trailing whitespace is
    stripped from the final string.
    """
    blocks = [f"Chunk {i}:\n{text}\n\n" for i, text in enumerate(texts, start=1)]
    return "".join(blocks).strip()

# Generate response
def generate_response(query, context):
    """Ask the LLM to answer ``query`` using only ``context``.

    Args:
        query: The user's question.
        context: Context string, e.g. the output of construct_context().

    Returns:
        The LLM output with leading and trailing whitespace removed.
    """
    prompt_template = """
    You are a nutrition expert. Answer the question based **only** on the provided context.
    If the context does not contain the answer, respond with "I don't know."

    Context:
    {context}

    Question: {question}
    """
    prompt = prompt_template.format(context=context, question=query)
    # Use .invoke(), the same runnable entry point this module uses for the
    # retriever; calling the LLM object directly is the deprecated form.
    return llm.invoke(prompt).strip()

In [None]:
%%writefile main.py

from fastapi import FastAPI
from pydantic import BaseModel
from rag_pipeline import (
    hybrid_search,
    rerank,
    construct_context,
    generate_response
)

app = FastAPI()

# Request body of POST /ask: a JSON object with one string field, "query".
class QueryRequest(BaseModel):
  query:str

# Answers a question with the retrieve -> rerank -> generate pipeline and
# returns a JSON object with a single "answer" key.
# The handler is declared async but calls the pipeline functions directly.
@app.post("/ask")
async def ask(request: QueryRequest):
  question = request.query
  # Fetch candidates, then keep the 5 the reranker scores highest.
  top_docs = rerank(question, hybrid_search(question, top_k=10), top_k=5)
  context = construct_context([ranked.page_content for ranked in top_docs])
  return {"answer": generate_response(question, context)}

In [None]:
# Execute rag_pipeline.py inside the notebook namespace.
# NOTE(review): the script loads its own embedding model, reranker and LLM,
# and the next cell imports the same module again — confirm the repeated
# model loads are intended and fit in memory.
%run rag_pipeline.py

In [None]:
# Import functions
# Importing the module runs its top-level code (artefact and model loading).
from rag_pipeline import hybrid_search, rerank, construct_context, generate_response

# Example query
query = "What are the benefits of antioxidants?"

# Step 1: Hybrid search
results = hybrid_search(query, top_k=10)

# Step 2: Rerank results (keep the 5 highest-scoring candidates)
reranked_results = rerank(query, results, top_k=5)

# Step 3: Construct context (numbered chunk texts as one string)
context = construct_context([doc.page_content for doc in reranked_results])

# Step 4: Generate response
response = generate_response(query, context)

# Print the final answer
print("Query:", query)
print("Answer:", response)

In [None]:
import subprocess

# Start Uvicorn in the background
# Runs the FastAPI app from main.py as a child process bound to 0.0.0.0:8000.
# The process handle is kept so a later cell can terminate it.
uvicorn_process = subprocess.Popen(
    ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
)

# Printed right after the process is spawned; nothing here waits for the
# server to finish starting.
print("Uvicorn server started in the background.")

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Set your ngrok token
# Never hard-code the token in the notebook: read it from the NGROK_AUTHTOKEN
# environment variable, or prompt for it without echoing it.
# The token that used to be written here in plain text should be treated as
# leaked and revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

print("ngrok authenticated successfully.")

In [None]:
from pyngrok import ngrok

# Connect ngrok to port 8000
# Opens a tunnel to local port 8000, the port the Uvicorn process was started on.
public_url = ngrok.connect(8000)
print(f"Public URL: {public_url}")

In [None]:
import requests

# Build the endpoint from the tunnel opened above rather than a hard-coded
# ngrok host name, which stops working once a new tunnel is created.
# pyngrok returns a tunnel object exposing `.public_url`; fall back to the
# value itself in case it is already a plain string.
base_url = getattr(public_url, "public_url", public_url)

response = requests.post(
    f"{base_url}/ask",
    json={"query": "What are the benefits of antioxidants?"},
    timeout=600,  # generous limit: without one a stalled request hangs the cell
)

# Print the raw response
print("Status Code:", response.status_code)
# Error pages are not necessarily JSON; show them as text instead of failing
# inside .json().
print(response.json() if response.ok else response.text)

In [None]:
# Ask the background Uvicorn process to stop.
# The call returns without waiting for the process to exit.
uvicorn_process.terminate()
print("Uvicorn server stopped.")