<a href="https://colab.research.google.com/github/devtank09/RAG-PDF-Chatbot/blob/main/Conversational_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain-groq faiss-cpu pypdf sentence-transformers python-dotenv langchain-community

In [None]:
import os
from dotenv import load_dotenv
from getpass import getpass

from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.document_loaders import PyPDFLoader
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Optional: uncomment the next line to load variables such as GROQ_API_KEY from a .env file.
#load_dotenv()

def get_groq_api_key():
    """Return the Groq API key, preferring the environment over a prompt.

    The ``GROQ_API_KEY`` environment variable is consulted first. When it is
    unset or empty, a notice is printed and the key is read interactively
    with ``getpass`` so it is not echoed to the terminal.

    Returns:
        The key from the environment, or whatever the user typed at the
        prompt (which may be an empty string).
    """
    env_key = os.environ.get("GROQ_API_KEY")
    if env_key:
        return env_key

    # Nothing usable in the environment: fall back to a hidden prompt.
    print("Groq API key not found in environment variables.")
    return getpass("Please enter your Groq API Key: ")

def process_pdf(file_path, embeddings):
    """Turn a PDF file into a retriever backed by a FAISS vector store.

    The PDF is loaded with ``PyPDFLoader``, split into chunks of 1000
    characters with 200 characters of overlap, and the chunks are indexed
    with ``FAISS.from_documents`` using the supplied embeddings.

    Args:
        file_path: Path of the PDF file to load.
        embeddings: Embedding object passed to ``FAISS.from_documents``.

    Returns:
        The result of ``vectorstore.as_retriever()`` on success, or ``None``
        when the splitter yields no chunks or any exception is raised along
        the way (the error is printed, not re-raised).
    """
    print(f"Processing PDF: {file_path}")
    try:
        pages = PyPDFLoader(file_path).load()

        chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = chunker.split_documents(pages)

        if chunks:
            # Diagnostic output describing what the splitter produced.
            first_chunk = chunks[0]
            print(f"Type of splits: {type(chunks)}")
            print(f"Type of first element in splits: {type(first_chunk)}")
            print(f"First element in splits: {first_chunk}")

            store = FAISS.from_documents(documents=chunks, embedding=embeddings)
            print("PDF processed successfully. Vector store created.")
            return store.as_retriever()

        print("Warning: No text could be extracted from the PDF.")
        return None
    except Exception as err:
        # Best-effort by design: the caller treats None as "ask again".
        print(f"An error occurred while processing the PDF: {err}")
        return None

def create_rag_chain(llm, retriever):
    """Assemble the conversational retrieval-augmented QA chain.

    Three pieces are wired together:
      1. ``create_history_aware_retriever`` with a prompt instructing the
         model to reformulate the latest question as a standalone one.
      2. ``create_stuff_documents_chain`` with a prompt instructing the model
         to answer from the ``{context}`` placeholder.
      3. ``create_retrieval_chain`` combining the two.

    Args:
        llm: Chat model handed to both the retriever and the answering chain.
        retriever: Base retriever wrapped by the history-aware retriever.

    Returns:
        The object returned by ``create_retrieval_chain``.
    """

    def _prompt_with_history(system_text):
        # Both stages use the same layout: system text, prior turns, new input.
        return ChatPromptTemplate.from_messages(
            [
                ("system", system_text),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

    # Stage 1: question contextualisation feeding the retriever.
    rewrite_instructions = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, "
        "just reformulate it if needed and otherwise return it as is."
    )
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, _prompt_with_history(rewrite_instructions)
    )

    # Stage 2: answering from the retrieved context.
    answer_instructions = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, just say that "
        "you don't know. Keep the answer detailed but relevant."
        "\n\n"
        "{context}"
    )
    answer_chain = create_stuff_documents_chain(
        llm, _prompt_with_history(answer_instructions)
    )

    # Stage 3: retrieval + answering as one chain.
    return create_retrieval_chain(history_aware_retriever, answer_chain)

def main():
    """Run the interactive "RAG with PDF" command-line session.

    Steps, in order:
      1. Obtain the Groq API key via ``get_groq_api_key``.
      2. Create the chat model and the embedding model.
      3. Ask for a PDF path until ``process_pdf`` returns a retriever.
      4. Build the RAG chain and wrap it with a message history.
      5. Loop: read a question and stream the answer to stdout until the
         user types ``exit``/``quit`` or interrupts the session.

    Returns:
        None. Returns early, after printing a message, when no API key is
        available, model initialisation fails, or input is interrupted
        while asking for the PDF path.
    """
    print("--- Welcome to RAG with PDF ---")

    # 1. Get API Key
    api_key = get_groq_api_key()
    if not api_key:
        print("Could not get API key. Exiting.")
        return

    # 2. Initialize LLM and Embeddings
    try:
        llm = ChatGroq(groq_api_key=api_key, model_name="Gemma2-9b-It")
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    except Exception as e:
        # Top-level boundary: report and stop rather than show a traceback.
        print(f"Error initializing models: {e}")
        return

    # 3. Get PDF and create retriever
    retriever = None
    while retriever is None:
        try:
            pdf_path = input("Please enter the full path to your PDF file: ").strip()
        except (KeyboardInterrupt, EOFError):
            # Same graceful exit as the chat loop; otherwise Ctrl-C / EOF
            # here would surface as an unhandled traceback.
            print("\nInput interrupted. Exiting.")
            return
        if os.path.exists(pdf_path) and pdf_path.lower().endswith('.pdf'):
            # process_pdf returns None on failure, which re-prompts.
            retriever = process_pdf(pdf_path, embeddings)
        else:
            print("Invalid file path or not a PDF. Please try again.")

    # 4. Create the conversational chain
    rag_chain = create_rag_chain(llm, retriever)

    # 5. Setup chat history
    chat_history = ChatMessageHistory()

    conversational_rag_chain = RunnableWithMessageHistory(
        rag_chain,
        lambda session_id: chat_history,  # We use the same history object for all sessions
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )

    # 6. Start the conversation loop
    print("\nPDF loaded. You can now start asking questions.")
    print("Type 'exit' or 'quit' to end the chat.")

    while True:
        try:
            # Strip first so that inputs like "exit " or " quit" still end
            # the chat instead of being sent to the model as a question.
            question = input("\nYour Question: ").strip()
            if not question:
                continue

            if question.lower() in ("exit", "quit"):
                print("Thank you for using the RAG assistant. Goodbye!")
                break

            print("\nAssistant:")
            # The streaming response
            response_stream = conversational_rag_chain.stream(
                {"input": question},
                config={"configurable": {"session_id": "any_string"}},  # session_id is a placeholder
            )

            # Only chunks carrying an "answer" key are printed; other
            # chunks in the stream are skipped.
            for chunk in response_stream:
                if "answer" in chunk:
                    print(chunk["answer"], end="", flush=True)

        except (KeyboardInterrupt, EOFError):
            print("\nChat interrupted. Exiting.")
            break
        except Exception as e:
            # Keep the session alive after a failed turn.
            print(f"\nAn error occurred: {e}")

# Start the interactive session only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

--- Welcome to RAG with PDF ---
Groq API key not found in environment variables.
Please enter your Groq API Key: ··········
Please enter the full path to your PDF file: /content/Explainable machine-learning predictions for catalysts in CO2-assisted propane oxidative dehydrogenation.pdf
Processing PDF: /content/Explainable machine-learning predictions for catalysts in CO2-assisted propane oxidative dehydrogenation.pdf
Type of splits: <class 'list'>
Type of first element in splits: <class 'langchain_core.documents.base.Document'>
First element in splits: page_content='Explainable machine-learning predictions for
catalysts in CO2-assisted propane oxidative
dehydrogenation†
Hongyu Liu, ‡ab Kangyu Liu, ‡b Hairuo Zhu, a Weiqing Guoa and Yuming Li*a
Propylene is an important raw material in the chemical industry that needs new routes for its production to
meet the demand. The CO2-assisted oxidative dehydrogenation of propane (CO2-ODHP) represents an
ideal way to produce propylene and uses the