<a href="https://colab.research.google.com/github/diyanigam/pdf-upload/blob/treebranch_straight/RAG_Chatbot_with_Gemma_2B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade the packages this notebook needs on top of the runtime's defaults.
!pip install -U bitsandbytes
# NOTE(review): the PyPI distribution named `fitz` is not PyMuPDF. If `import fitz`
# (PyMuPDF) is what the PDF-ingestion step relies on, `pymupdf` is the package to
# install instead — confirm. Nothing in this notebook imports `fitz` directly.
!pip install fitz langchain_chroma langchain_huggingface

In [None]:
import os
import shutil
import re
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch

# Hugging Face identifiers: the chat model and the sentence-embedding model.
GEMMA_MODEL_NAME = "google/gemma-2b-it"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Directory holding the persisted Chroma store.
PERSIST_DIRECTORY = "./chroma_phi"

# Embedding function later handed to Chroma so queries are embedded with this model.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)

In [None]:

# Open the persisted Chroma store, embedding queries with `embedding_model`.
vectordb = Chroma(
    embedding_function=embedding_model,
    persist_directory=PERSIST_DIRECTORY,
)
# Retriever over the store: each query returns the top 10 matches (k=10).
# k is kept high so that enough relevant sections are retrieved.
retriever = vectordb.as_retriever(search_kwargs=dict(k=10))

# --- 3. Load the Gemma 2B LLM ---
# BitsAndBytes settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer, together with its padding fallback: when no pad token is defined,
# the EOS token is reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(GEMMA_MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Quantized model; device placement is delegated to the loader via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    GEMMA_MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Generation settings for the text-generation pipeline.
_generation_settings = {
    "max_new_tokens": 768,  # room for longer, more comprehensive answers
    "do_sample": True,
    "top_k": 50,
    "temperature": 0.7,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **_generation_settings,
)

# --- Custom LLM Wrapper for LangChain Compatibility ---
class CustomHuggingFaceLLM:
    """Adapter that lets a HuggingFace text-generation pipeline sit in a LangChain chain.

    `invoke` accepts the prompt value produced by a chat prompt template,
    renders it with the tokenizer's chat template, runs the pipeline, and
    returns only the newly generated text.
    """

    # LangChain message `type` -> role name used in the chat-template dicts.
    # Any other type is treated as a user turn.
    # NOTE(review): Gemma's chat template is understood not to accept a
    # "system" role, which is why no system mapping is offered — confirm.
    _ROLE_BY_TYPE = {"human": "user", "ai": "assistant"}

    def __init__(self, pipeline, tokenizer):
        self.pipeline = pipeline
        self.tokenizer = tokenizer

    def _to_chat_messages(self, messages):
        """Convert message objects into the role/content dicts the chat template takes."""
        return [
            {
                "role": self._ROLE_BY_TYPE.get(getattr(msg, "type", "human"), "user"),
                "content": msg.content,
            }
            for msg in messages
        ]

    def invoke(self, prompt_value) -> str:
        """Generate a reply for `prompt_value` and return it without the echoed prompt.

        Args:
            prompt_value: object exposing `.messages`, a sequence of messages
                that each have a `.content` (and optionally a `.type`).

        Returns:
            The generated text, stripped of surrounding whitespace.

        Raises:
            ValueError: if `prompt_value` carries no messages.
        """
        messages = prompt_value.messages
        if not messages:
            raise ValueError("Expected PromptValue with at least one message.")

        # `apply_chat_template` works on role/content dicts, not on LangChain
        # message objects, so convert before rendering.
        formatted_prompt = self.tokenizer.apply_chat_template(
            self._to_chat_messages(messages),
            tokenize=False,
            add_generation_prompt=True
        )

        result = self.pipeline(formatted_prompt)
        generated_text = result[0]['generated_text']

        # The pipeline output may echo the prompt; drop that prefix when present.
        if generated_text.startswith(formatted_prompt):
            return generated_text[len(formatted_prompt):].strip()
        return generated_text.strip()

# Wrap the generation pipeline and tokenizer so the chain can call `llm.invoke`.
llm = CustomHuggingFaceLLM(llm_pipeline, tokenizer)

# --- 4. Define your Prompt Template (HIGHLY IMPROVED FOR STRUCTURED DATA) ---
# This template is crucial. It explicitly tells Gemma about the structure of the context.
# `{context}` and `{question}` are the two variables filled in when the prompt is built.
# `[YOUR NAME HERE]` is literal text, not a template variable: edit the string to
# replace it with the real name before use.
template = """
You are a helpful and knowledgeable AI assistant. Your purpose is to act as a personal chatbot for a user named [YOUR NAME HERE].
You have access to a knowledge base containing specific sections about [YOUR NAME HERE]'s resume, projects, and personal details.

Each piece of information in the context is a distinct section, often starting with a clear heading like "Name of project:", "Introduction:", "Summary:", "Technologies Used:", "Objectives:", "Problem Statement:", "Methodology:", "Key Components:", "Implementation Details:", "Results:", "Learnings:", "Future Scope:", "Linked Resources:", or similar.

Your task is to answer user questions *only* using the provided context.
When answering, identify the relevant sections from the context and synthesize the information from those sections to form a comprehensive and accurate answer.
If a question asks about a project, look for sections related to that project's name, introduction, summary, technologies, etc.
If the provided context does not contain enough information to answer the question, please politely state: "I apologize, but I don't have enough information in my knowledge base to answer that question." Do not try to make up an answer.

Context:
{context}

Question: {question}

Answer:
"""
# Build the chat prompt from the template string above.
prompt = ChatPromptTemplate.from_template(template)

# --- 5. Construct the RAG Chain ---
def _format_docs(docs):
    """Join the retrieved documents' text into a single context string.

    Without this step the prompt's `{context}` slot is filled with the `repr`
    of a list of `Document` objects (metadata, quoting and escaped newlines
    included) instead of the documents' text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Flow: question -> {retrieved text, question} -> prompt -> Gemma -> plain string.
rag_chain = (
    {
        "context": retriever | RunnableLambda(_format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | RunnableLambda(llm.invoke)
    | StrOutputParser()
)

# --- 6. Chat Function ---
def chat_with_gemma(query: str):
    """Run `query` through `rag_chain` and return whatever the chain produces."""
    return rag_chain.invoke(query)

# --- Example Usage ---
if __name__ == "__main__":
    # --- IMPORTANT: Replace [YOUR NAME HERE] in the template above with your actual name! ---
    # Example: template = "... chatbot for a user named John Doe. ..."

    print("\nWelcome to your personal RAG chatbot with Gemma 2B! Ask me anything about yourself.")
    print("Type 'exit' to quit.")

    # Simple REPL: read a question, answer it, repeat until the user quits.
    while True:
        try:
            # Strip so that inputs such as "exit " or " EXIT" still quit.
            user_query = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the chat instead of raising a traceback.
            print("\nGoodbye!")
            break
        if not user_query:
            # Nothing to ask: skip retrieval and generation for blank input.
            continue
        if user_query.lower() == 'exit':
            print("Goodbye!")
            break
        response = chat_with_gemma(user_query)
        print(f"Bot: {response}")

In [None]:
# --- Configuration ---
# Instruction-tuned Gemma 2B checkpoint, chosen for better chat performance.
GEMMA_MODEL_NAME = "google/gemma-2b-it"
# Embedding model; it has to be the one the Chroma DB was built with.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Directory of the persisted Chroma database; it has to match the directory
# used when the database was populated.
PERSIST_DIRECTORY = "./chroma_phi"

# --- 1. Load your Embedding Model ---
# Turns text queries into embedding vectors so that relevant documents can be
# looked up in the Chroma database.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)

In [None]:
# --- 2. Load your Chroma Vector Database ---
# Open the persisted store with the same embedding function, so query vectors
# are compatible with the stored ones.
vectordb = Chroma(
    embedding_function=embedding_model,
    persist_directory=PERSIST_DIRECTORY,
)
# Retriever over the store; search_kwargs k=10 means the top 10 matches are
# returned for each query.
retriever = vectordb.as_retriever(search_kwargs=dict(k=10))

In [None]:
# Install the Hugging Face Hub client and log in interactively.
# NOTE(review): presumably needed because the Gemma weights are gated on the Hub — confirm.
# If so, this cell has to run before any cell that calls `from_pretrained` with
# GEMMA_MODEL_NAME, including the combined cell earlier in the notebook.
!pip install huggingface_hub
!huggingface-cli login

In [None]:



# --- 3. Load the Gemma 2B LLM ---
# BitsAndBytes 4-bit quantization shrinks the model's memory footprint.
#   load_in_4bit              - load the weights in 4-bit precision.
#   bnb_4bit_use_double_quant - apply a second quantization pass for a smaller footprint.
#   bnb_4bit_quant_type       - quantization type; "nf4" is NormalFloat 4-bit.
#   bnb_4bit_compute_dtype    - dtype used for computation (bfloat16 here).
_quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_quantization_kwargs)

# Options shared by the tokenizer and model loaders.
_shared_load_kwargs = {"trust_remote_code": True}

# Tokenizer for Gemma 2B: encodes prompts and decodes generated tokens.
tokenizer = AutoTokenizer.from_pretrained(GEMMA_MODEL_NAME, **_shared_load_kwargs)

# Quantized Gemma 2B; device_map="auto" leaves device placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    GEMMA_MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    **_shared_load_kwargs,
)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline: takes a prompt string and returns generated text.
_generation_kwargs = {
    "max_new_tokens": 256,        # cap on the length of the generated response
    "do_sample": True,            # sample instead of always taking the top token
    "top_k": 50,                  # sample only among the 50 most likely tokens
    "temperature": 0.7,           # randomness control (lower = more deterministic)
    "num_return_sequences": 1,    # a single completion per prompt
    "eos_token_id": tokenizer.eos_token_id,  # stop at end-of-sequence
    "pad_token_id": tokenizer.pad_token_id,  # token id used for padding
}
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **_generation_kwargs,
)

In [None]:
# --- Custom LLM Wrapper for LangChain Compatibility ---
# Bridges the HuggingFace pipeline and the chain: the chain hands over a prompt
# value, and this wrapper hands back a plain string.
class CustomHuggingFaceLLM:
    """Single-turn adapter around a text-generation pipeline and its tokenizer."""

    def __init__(self, pipeline, tokenizer):
        self.tokenizer = tokenizer
        self.pipeline = pipeline

    def invoke(self, prompt_value) -> str:
        """Answer the one message in `prompt_value`.

        The message content is sent as a single user turn through the
        tokenizer's chat template and then through the pipeline. If the
        pipeline output begins with the rendered prompt, that prefix is
        removed. The result is whitespace-stripped.

        Raises:
            ValueError: if `prompt_value.messages` is empty or holds more
                than one message.
        """
        turns = prompt_value.messages
        if not turns or len(turns) > 1:
            raise ValueError("Expected PromptValue with exactly one message.")

        # Whatever kind of message the prompt template produced, it is presented
        # to the chat template as one user turn.
        user_turn = {"role": "user", "content": turns[0].content}

        # Render to text (no token ids) and ask the template to open the
        # model's turn so that generation starts with the reply.
        chat_text = self.tokenizer.apply_chat_template(
            [user_turn],
            tokenize=False,
            add_generation_prompt=True,
        )

        outputs = self.pipeline(chat_text)
        full_text = outputs[0]['generated_text']

        # Keep only what follows the echoed prompt. If the prompt is not
        # echoed, keep the whole output.
        reply = full_text[len(chat_text):] if full_text.startswith(chat_text) else full_text
        return reply.strip()

# Instantiate your custom LLM wrapper.
llm = CustomHuggingFaceLLM(llm_pipeline, tokenizer)

# --- 4. Define your Prompt Template ---
# This template guides the LLM on how to use the retrieved context and answer the question.
# `{context}` and `{question}` are filled in by the chain. Built with
# `ChatPromptTemplate.from_template`, the whole text becomes one human message;
# `CustomHuggingFaceLLM.invoke` expects exactly one message and renders it as a
# user turn through `tokenizer.apply_chat_template`.
template = """
You are a cute girl named Diya. You have all the information about Diya.
Use the following context to answer the user's question.

Context:
{context}

Question: {question}

Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

# --- 5. Construct the RAG Chain ---
def _format_docs(docs):
    """Concatenate the text of the retrieved documents, separated by blank lines.

    Feeding the retriever's list of `Document` objects straight into the
    prompt would render the list's `repr` (metadata and escaped newlines
    included) in the `{context}` slot; the model should see only the text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# This chain defines the flow of the RAG system:
# 1. The mapping step builds a dict with 'context' and 'question' keys:
#    - "context": the retriever fetches documents for the user's question and
#      `_format_docs` turns them into one block of text.
#    - "question": the original question is passed through unchanged.
# 2. `| prompt`: the dict is formatted into a prompt value based on `template`.
# 3. `| RunnableLambda(llm.invoke)`: the prompt value goes to
#    `CustomHuggingFaceLLM.invoke`, which uses Gemma to generate a response.
# 4. `| StrOutputParser()`: the generated text is returned as a plain string.
rag_chain = (
    {
        "context": retriever | RunnableLambda(_format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | RunnableLambda(llm.invoke)  # the prompt value is passed directly to the custom LLM
    | StrOutputParser()
)

# --- 6. Chat Function ---
# Thin entry point for talking to the RAG chatbot.
def chat_with_gemma(query: str):
    """Send the user's `query` through `rag_chain` and return the chain's output."""
    answer = rag_chain.invoke(query)
    return answer

# --- Example Usage ---
if __name__ == "__main__":
    print("Welcome to your personal RAG chatbot with Gemma 2B! Ask me anything about yourself.")
    print("Type 'exit' to quit.")

    # Read-answer loop; ends on 'exit', end of input, or an interrupt.
    while True:
        try:
            # Surrounding whitespace is dropped so "exit " still quits.
            user_query = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave cleanly rather than surfacing a traceback.
            print("\nGoodbye!")
            break
        if not user_query:
            # Blank line: ask again without running retrieval and generation.
            continue
        if user_query.lower() == 'exit':
            print("Goodbye!")
            break
        # Get response from the chatbot.
        response = chat_with_gemma(user_query)

        print(f"Bot: {response}")